From 937eae3a34c94998eae4bcaf0525aaab609726df Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 12 Apr 2026 13:44:05 +0800 Subject: [PATCH 01/21] draft gguf remove Signed-off-by: Isotr0py --- .github/dependabot.yml | 1 - .pre-commit-config.yaml | 2 +- CMakeLists.txt | 1 - csrc/ops.h | 24 +- csrc/quantization/gguf/dequantize.cuh | 571 ------ csrc/quantization/gguf/ggml-common.h | 1150 ----------- csrc/quantization/gguf/gguf_kernel.cu | 542 ----- csrc/quantization/gguf/mmq.cuh | 610 ------ csrc/quantization/gguf/mmvq.cuh | 212 -- csrc/quantization/gguf/moe.cuh | 739 ------- csrc/quantization/gguf/moe_vec.cuh | 338 --- csrc/quantization/gguf/vecdotq.cuh | 1812 ----------------- csrc/torch_bindings.cpp | 33 - docs/features/quantization/README.md | 2 - docs/features/quantization/gguf.md | 87 - docs/mkdocs/hooks/generate_examples.py | 1 - requirements/common.txt | 1 - requirements/test/rocm.txt | 8 - tests/compile/fullgraph/test_full_graph.py | 6 - tests/kernels/quantization/test_ggml.py | 54 - tests/kernels/quantization/test_gguf.py | 207 -- .../generation/test_multimodal_gguf.py | 180 -- tests/models/quantization/test_gguf.py | 204 -- tests/models/test_gguf_download.py | 221 -- tests/transformers_utils/test_utils.py | 210 -- vllm/_custom_ops.py | 128 -- vllm/config/load.py | 2 - vllm/config/model.py | 26 +- vllm/engine/arg_utils.py | 10 +- vllm/model_executor/layers/fused_moe/layer.py | 13 +- vllm/model_executor/layers/linear.py | 72 +- .../layers/quantization/__init__.py | 3 - .../layers/quantization/base_config.py | 26 + .../layers/quantization/gguf.py | 691 ------- .../layers/vocab_parallel_embedding.py | 6 +- vllm/model_executor/model_loader/__init__.py | 4 - .../model_loader/gguf_loader.py | 436 ---- .../model_loader/weight_utils.py | 168 +- vllm/model_executor/models/apertus.py | 7 +- vllm/model_executor/models/exaone.py | 6 +- vllm/model_executor/models/exaone4.py | 6 +- vllm/model_executor/models/gemma3.py | 12 +- vllm/model_executor/models/jais2.py | 6 +- 
vllm/model_executor/models/llama.py | 7 +- vllm/model_executor/models/llama4.py | 7 +- vllm/model_executor/models/openpangu.py | 21 +- vllm/model_executor/models/siglip.py | 15 +- vllm/model_format.py | 162 ++ vllm/platforms/rocm.py | 1 - vllm/tokenizers/registry.py | 31 +- vllm/transformers_utils/config.py | 136 +- vllm/transformers_utils/gguf_utils.py | 336 --- vllm/transformers_utils/processor.py | 20 +- vllm/v1/metrics/perf.py | 1 - 54 files changed, 359 insertions(+), 9216 deletions(-) delete mode 100644 csrc/quantization/gguf/dequantize.cuh delete mode 100644 csrc/quantization/gguf/ggml-common.h delete mode 100644 csrc/quantization/gguf/gguf_kernel.cu delete mode 100644 csrc/quantization/gguf/mmq.cuh delete mode 100644 csrc/quantization/gguf/mmvq.cuh delete mode 100644 csrc/quantization/gguf/moe.cuh delete mode 100644 csrc/quantization/gguf/moe_vec.cuh delete mode 100644 csrc/quantization/gguf/vecdotq.cuh delete mode 100644 docs/features/quantization/gguf.md delete mode 100644 tests/kernels/quantization/test_ggml.py delete mode 100644 tests/kernels/quantization/test_gguf.py delete mode 100644 tests/models/multimodal/generation/test_multimodal_gguf.py delete mode 100644 tests/models/quantization/test_gguf.py delete mode 100644 tests/models/test_gguf_download.py delete mode 100644 vllm/model_executor/layers/quantization/gguf.py delete mode 100644 vllm/model_executor/model_loader/gguf_loader.py create mode 100644 vllm/model_format.py delete mode 100644 vllm/transformers_utils/gguf_utils.py diff --git a/.github/dependabot.yml b/.github/dependabot.yml index a017d69be991..944929fc55e5 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -21,7 +21,6 @@ updates: - dependency-name: "torchvision" - dependency-name: "xformers" - dependency-name: "lm-format-enforcer" - - dependency-name: "gguf" - dependency-name: "compressed-tensors" - dependency-name: "ray[cgraph]" # Ray Compiled Graph - dependency-name: "lm-eval" diff --git a/.pre-commit-config.yaml 
b/.pre-commit-config.yaml index 33b1db69dec4..6896b4494f27 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,7 +21,7 @@ repos: rev: v21.1.2 hooks: - id: clang-format - exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' + exclude: 'csrc/moe/topk_softmax_kernels.cu|vllm/third_party/.*' types_or: [c++, cuda] args: [--style=file, --verbose] - repo: https://github.com/DavidAnson/markdownlint-cli2 diff --git a/CMakeLists.txt b/CMakeLists.txt index f24c12eff83c..d22252f69280 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -300,7 +300,6 @@ set(VLLM_EXT_SRC "csrc/quantization/w8a8/fp8/common.cu" "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu" "csrc/quantization/fused_kernels/fused_silu_mul_block_quant.cu" - "csrc/quantization/gguf/gguf_kernel.cu" "csrc/quantization/activation_kernels.cu" "csrc/cuda_utils_kernels.cu" "csrc/custom_all_reduce.cu" diff --git a/csrc/ops.h b/csrc/ops.h index da066512b7c1..2ef62b801334 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -203,28 +203,6 @@ torch::Tensor awq_dequantize(torch::Tensor _kernel, #endif -torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m, - int64_t n, - std::optional const& dtype); - -torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X, - int64_t type, int64_t row); - -torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type, - int64_t row); - -torch::Tensor ggml_moe_a8(torch::Tensor X, torch::Tensor W, - torch::Tensor sorted_token_ids, - torch::Tensor expert_ids, - torch::Tensor num_tokens_post_padded, int64_t type, - int64_t row, int64_t top_k, int64_t tokens); - -torch::Tensor ggml_moe_a8_vec(torch::Tensor X, torch::Tensor W, - torch::Tensor topk_ids, int64_t top_k, - int64_t type, int64_t row, int64_t tokens); - -int64_t ggml_moe_get_block_size(int64_t type); - void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor 
const& input, torch::Tensor const& scale, std::optional const& azp); @@ -320,4 +298,4 @@ std::tuple minimax_allreduce_rms_qk( torch::Tensor const& norm_weight_k, torch::Tensor workspace, int64_t const q_size, int64_t const kv_size, int64_t const rank, int64_t const nranks, double const eps); -#endif \ No newline at end of file +#endif diff --git a/csrc/quantization/gguf/dequantize.cuh b/csrc/quantization/gguf/dequantize.cuh deleted file mode 100644 index 9d355003ef91..000000000000 --- a/csrc/quantization/gguf/dequantize.cuh +++ /dev/null @@ -1,571 +0,0 @@ -// copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/convert.cu -// Dequant functions -static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ - const block_q4_0 * x = (const block_q4_0 *) vx; - - const dfloat d = x[ib].d; - - const int vui = x[ib].qs[iqs]; - - v.x = __int2half_rn(vui & 0xF); - v.y = __int2half_rn(vui >> 4); - - v = __hsub2(v, __floats2half2_rn(8.0f, 8.0f)); - v = __hmul2(v, {d, d}); -} - -static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ - const block_q4_1 * x = (const block_q4_1 *) vx; - - const dfloat d = __low2half(x[ib].dm); - const dfloat m = __high2half(x[ib].dm); - - const int vui = x[ib].qs[iqs]; - - v.x = __int2half_rn(vui & 0xF); - v.y = __int2half_rn(vui >> 4); - - v = __hmul2(v, {d, d}); - v = __hadd2(v, {m, m}); -} - -static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ - const block_q5_0 * x = (const block_q5_0 *) vx; - - const dfloat d = x[ib].d; - - uint32_t qh; - memcpy(&qh, x[ib].qh, sizeof(qh)); - - const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - - v.x = __int2half_rn((x[ib].qs[iqs] & 0xf) | xh_0); - v.y = __int2half_rn((x[ib].qs[iqs] >> 4) | xh_1); - - v = __hsub2(v, __floats2half2_rn(16.0f, 16.0f)); - v = 
__hmul2(v, {d, d}); -} - -static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ - const block_q5_1 * x = (const block_q5_1 *) vx; - - const dfloat d = __low2half(x[ib].dm); - const dfloat m = __high2half(x[ib].dm); - - uint32_t qh; - memcpy(&qh, x[ib].qh, sizeof(qh)); - - const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - - v.x = __int2half_rn((x[ib].qs[iqs] & 0xf) | xh_0); - v.y = __int2half_rn((x[ib].qs[iqs] >> 4) | xh_1); - - v = __hmul2(v, {d, d}); - v = __hadd2(v, {m, m}); -} - -static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ - const block_q8_0 * x = (const block_q8_0 *) vx; - - const dfloat d = x[ib].d; - - v.x = __int2half_rn(x[ib].qs[iqs + 0]); - v.y = __int2half_rn(x[ib].qs[iqs + 1]); - - v = __hmul2(v, {d, d}); -} - -template -static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) { - const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x); - - if (i >= k) { - return; - } - - const int ib = i/qk; // block index - const int iqs = (i%qk)/qr; // quant index - const int iybs = i - i%qk; // y block start index - const int y_offset = qr == 1 ? 
1 : qk/2; - - // dequantize - dfloat2 v; - dequantize_kernel(vx, ib, iqs, v); - - y[iybs + iqs + 0] = convert_from_half(v.x); - y[iybs + iqs + y_offset] = convert_from_half(v.y); -} - -template -static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { - - const auto i = blockIdx.x; - const block_q2_K * x = (const block_q2_K *) vx; - - const auto tid = threadIdx.x; - const int n = tid/32; - const int l = tid - 32*n; - const int is = 8*n + l/16; - - const uint8_t q = x[i].qs[32*n + l]; - dst_t * y = yy + i*QK_K + 128*n; - - half dall = __low2half(x[i].dm); - half dmin = __high2half(x[i].dm); - y[l+ 0] = convert_from_half(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+0] & 0xF) * ((q >> 0) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+0] >> 4)))); - y[l+32] = convert_from_half(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+2] & 0xF) * ((q >> 2) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+2] >> 4)))); - y[l+64] = convert_from_half(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+4] & 0xF) * ((q >> 4) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+4] >> 4)))); - y[l+96] = convert_from_half(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+6] & 0xF) * ((q >> 6) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+6] >> 4)))); -} - -template -static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { - - const auto i = blockIdx.x; - const block_q3_K * x = (const block_q3_K *) vx; - - const auto r = threadIdx.x/4; - const int tid = r/2; - const int is0 = r%2; - const int l0 = 16*is0 + 4*(threadIdx.x%4); - const int n = tid / 4; - const int j = tid - 4*n; - - uint8_t m = 1 << (4*n + j); - int is = 8*n + 2*j + is0; - int shift = 2*j; - - int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) : - is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) : - is < 12 ? 
(x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) : - (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4); - half d_all = x[i].d; - half dl = __hmul(d_all, __int2half_rn(us - 32)); - - dst_t * y = yy + i*QK_K + 128*n + 32*j; - const uint8_t * q = x[i].qs + 32*n; - const uint8_t * hm = x[i].hmask; - - for (int l = l0; l < l0+4; ++l) { - y[l] = convert_from_half(__hmul(dl, __int2half_rn((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)))); - } -} - -static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { - if (j < 4) { - d = q[j] & 63; m = q[j + 4] & 63; - } else { - d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); - m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); - } -} - -template -static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { - const block_q4_K * x = (const block_q4_K *) vx; - - const auto i = blockIdx.x; - - // assume 32 threads - const auto tid = threadIdx.x; - const int il = tid/8; - const int ir = tid%8; - const int is = 2*il; - const int n = 4; - - dst_t * y = yy + i*QK_K + 64*il + n*ir; - - const half dall = __low2half(x[i].dm); - const half dmin = __high2half(x[i].dm); - - const uint8_t * q = x[i].qs + 32*il + n*ir; - - uint8_t sc, m; - get_scale_min_k4(is + 0, x[i].scales, sc, m); - const half d1 = __hmul(dall, __int2half_rn(sc)); - const half m1 = __hmul(dmin, __int2half_rn(m)); - get_scale_min_k4(is + 1, x[i].scales, sc, m); - const half d2 = __hmul(dall, __int2half_rn(sc)); - const half m2 = __hmul(dmin, __int2half_rn(m)); - for (int l = 0; l < n; ++l) { - y[l + 0] = convert_from_half(__hsub(__hmul(d1, __int2half_rn(q[l] & 0xF)), m1)); - y[l +32] = convert_from_half(__hsub(__hmul(d2, __int2half_rn(q[l] >> 4)), m2)); - } -} - -template -static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { - const block_q5_K * x = (const block_q5_K *) vx; - - const auto i = blockIdx.x; - - // assume 64 
threads - this is very slightly better than the one below - const auto tid = threadIdx.x; - const int il = tid/16; // il is in 0...3 - const int ir = tid%16; // ir is in 0...15 - const int is = 2*il; // is is in 0...6 - - dst_t * y = yy + i*QK_K + 64*il + 2*ir; - - const half dall = __low2half(x[i].dm); - const half dmin = __high2half(x[i].dm); - - const uint8_t * ql = x[i].qs + 32*il + 2*ir; - const uint8_t * qh = x[i].qh + 2*ir; - - uint8_t sc, m; - get_scale_min_k4(is + 0, x[i].scales, sc, m); - const half d1 = __hmul(dall, __int2half_rn(sc)); const half m1 = __hmul(dmin, __int2half_rn(m)); - get_scale_min_k4(is + 1, x[i].scales, sc, m); - const half d2 = __hmul(dall, __int2half_rn(sc)); const half m2 = __hmul(dmin, __int2half_rn(m)); - - uint8_t hm = 1 << (2*il); - y[ 0] = convert_from_half(__hsub(__hmul(d1, __int2half_rn((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0))), m1)); - y[ 1] = convert_from_half(__hsub(__hmul(d1, __int2half_rn((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0))), m1)); - hm <<= 1; - y[32] = convert_from_half(__hsub(__hmul(d2, __int2half_rn((ql[0] >> 4) + (qh[0] & hm ? 16 : 0))), m2)); - y[33] = convert_from_half(__hsub(__hmul(d2, __int2half_rn((ql[1] >> 4) + (qh[1] & hm ? 
16 : 0))), m2)); -} - -template -static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { - const block_q6_K * x = (const block_q6_K *) vx; - - const auto i = blockIdx.x; - - // assume 64 threads - this is very slightly better than the one below - const auto tid = threadIdx.x; - const int ip = tid/32; // ip is 0 or 1 - const int il = tid - 32*ip; // 0...32 - const int is = 8*ip + il/16; - - dst_t * y = yy + i*QK_K + 128*ip + il; - - const half d = x[i].d; - - const uint8_t * ql = x[i].ql + 64*ip + il; - const uint8_t qh = x[i].qh[32*ip + il]; - const int8_t * sc = x[i].scales + is; - - y[ 0] = convert_from_half(__hmul(d, __int2half_rn(sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32)))); - y[32] = convert_from_half(__hmul(d, __int2half_rn(sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32)))); - y[64] = convert_from_half(__hmul(d, __int2half_rn(sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32)))); - y[96] = convert_from_half(__hmul(d, __int2half_rn(sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32)))); -} - -template -static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) { - - const auto i = blockIdx.x; - const block_iq2_xxs * x = (const block_iq2_xxs *) vx; - - const auto tid = threadIdx.x; - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint16_t * q2 = x[i].qs + 4*ib; - const uint8_t * aux8 = (const uint8_t *)q2; - const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]); - const uint32_t aux32 = q2[2] | (q2[3] << 16); - const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.25f; - const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; - for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? 
-1.f : 1.f); -} - -template -static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) { - - const auto i = blockIdx.x; - const block_iq2_xs * x = (const block_iq2_xs *) vx; - - const auto tid = threadIdx.x; - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint16_t * q2 = x[i].qs + 4*ib; - const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511)); - const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; - const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; - for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); - -} - -template -static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { - - const auto i = blockIdx.x; - const block_iq2_s * x = (const block_iq2_s *) vx; - - const auto tid = threadIdx.x; - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300))); - const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; - const uint8_t signs = x[i].qs[QK_K/8+4*ib+il]; - for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? 
-1.f : 1.f); -} - -template -static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) { - - const auto i = blockIdx.x; - const block_iq3_xxs * x = (const block_iq3_xxs *) vx; - - const auto tid = threadIdx.x; - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint8_t * q3 = x[i].qs + 8*ib; - const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib; - const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]); - const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]); - const uint32_t aux32 = gas[0] | (gas[1] << 16); - const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.5f; - const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; - for (int j = 0; j < 4; ++j) { - y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); - y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); - } -} - -template -static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { - - const auto i = blockIdx.x; - const block_iq3_s * x = (const block_iq3_s *) vx; - - const auto tid = threadIdx.x; - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint8_t * qs = x[i].qs + 8*ib; - const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256))); - const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256))); - const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f; - const uint8_t signs = x[i].signs[4*ib + il]; - for (int j = 0; j < 4; ++j) { - y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); - y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? 
-1.f : 1.f); - } -} - -template -static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { - - const int64_t i = blockIdx.x; - const block_iq1_s * x = (const block_iq1_s *) vx; - - const int64_t tid = threadIdx.x; - const int64_t il = tid/8; // 0...3 - const int64_t ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA; - const float d = __half2float(x[i].d) * (2*((x[i].qh[ib] >> 12) & 7) + 1); - uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; - grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)]; - grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; - grid32[0] &= 0x0f0f0f0f; - for (int j = 0; j < 8; ++j) { - y[j] = d * (q[j] + delta); - } -} - -template -static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) { - - const int64_t i = blockIdx.x; - const block_iq1_m * x = (const block_iq1_m *) vx; - - const int64_t tid = threadIdx.x; - const int64_t il = tid/8; // 0...3 - const int64_t ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint16_t * sc = (const uint16_t *)x[i].scales; - iq1m_scale_t scale; - scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); - const int64_t ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4); - const float d = __half2float(scale.f16) * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1); - const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? 
-1 - IQ1M_DELTA : -1 + IQ1M_DELTA; - uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; - grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)]; - grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; - grid32[0] &= 0x0f0f0f0f; - for (int j = 0; j < 8; ++j) { - y[j] = d * (q[j] + delta); - } -} - -template -static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) { - - const auto i = blockIdx.x; - const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL); - - const auto tid = threadIdx.x; - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 4*il; - const uint8_t * q4 = x[ib].qs + 4*il; - const float d = __half2float(x[ib].d); - for (int j = 0; j < 4; ++j) { - y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; - y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; - } - -} - -template -static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) { - const auto i = blockIdx.x; - const block_iq4_xs * x = (const block_iq4_xs *)vx; - - const auto tid = threadIdx.x; - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 4*il; - const uint8_t * q4 = x[i].qs + 16*ib + 4*il; - const float d = __half2float(x[i].d) * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32); - for (int j = 0; j < 4; ++j) { - y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; - y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; - } -} - -template -static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) { - const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE); - dequantize_block<<>>(vx, y, k); -} - -template -static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; - dequantize_block_q2_K<<>>(vx, 
y); -} - -template -static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; - dequantize_block_q3_K<<>>(vx, y); -} - -template -static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; - dequantize_block_q4_K<<>>(vx, y); -} - -template -static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; - dequantize_block_q5_K<<>>(vx, y); -} - -template -static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; - dequantize_block_q6_K<<>>(vx, y); -} - -template -static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; - dequantize_block_iq2_xxs<<>>(vx, y); -} - -template -static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; - dequantize_block_iq2_xs<<>>(vx, y); -} - -template -static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; - dequantize_block_iq2_s<<>>(vx, y); -} - -template -static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; - dequantize_block_iq3_xxs<<>>(vx, y); -} - -template -static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; - dequantize_block_iq3_s<<>>(vx, y); -} - -template -static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; - dequantize_block_iq1_s<<>>(vx, y); -} - -template -static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; - 
dequantize_block_iq1_m<<>>(vx, y); -} - -template -static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = (k + QK_K - 1) / QK_K; - dequantize_block_iq4_nl<<>>(vx, y); -} - -template -static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = (k + QK_K - 1) / QK_K; - dequantize_block_iq4_xs<<>>(vx, y); -} - -template -static to_cuda_ggml_t ggml_get_to_cuda(int64_t type) { - switch (type) { - case 2: - return dequantize_block_cuda; - case 3: - return dequantize_block_cuda; - case 6: - return dequantize_block_cuda; - case 7: - return dequantize_block_cuda; - case 8: - return dequantize_block_cuda; - case 10: - return dequantize_row_q2_K_cuda; - case 11: - return dequantize_row_q3_K_cuda; - case 12: - return dequantize_row_q4_K_cuda; - case 13: - return dequantize_row_q5_K_cuda; - case 14: - return dequantize_row_q6_K_cuda; - case 16: - return dequantize_row_iq2_xxs_cuda; - case 17: - return dequantize_row_iq2_xs_cuda; - case 18: - return dequantize_row_iq3_xxs_cuda; - case 19: - return dequantize_row_iq1_s_cuda; - case 20: - return dequantize_row_iq4_nl_cuda; - case 21: - return dequantize_row_iq3_s_cuda; - case 22: - return dequantize_row_iq2_s_cuda; - case 23: - return dequantize_row_iq4_xs_cuda; - case 29: - return dequantize_row_iq1_m_cuda; - default: - return nullptr; - } -} diff --git a/csrc/quantization/gguf/ggml-common.h b/csrc/quantization/gguf/ggml-common.h deleted file mode 100644 index 6bef5db3ccf1..000000000000 --- a/csrc/quantization/gguf/ggml-common.h +++ /dev/null @@ -1,1150 +0,0 @@ -// copied from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-common.h -#define QK_K 256 -#define K_QUANTS_PER_ITERATION 2 -#define WARP_SIZE_GGUF 32 -#define K_SCALE_SIZE 12 -#define CUDA_DEQUANTIZE_BLOCK_SIZE 256 -#define CUDA_QUANTIZE_BLOCK_SIZE 256 -#define GGML_CUDA_DMMV_X 32 -#define GGML_CUDA_MMV_Y 1 - - -// Data Structures -// QK = 
number of values after dequantization -// QR = QK / number of values before dequantization -// QI = number of 32 bit integers before dequantization - -#define QK4_0 32 -#define QR4_0 2 -#define QI4_0 (QK4_0 / (4 * QR4_0)) -typedef struct { - half d; // delta - uint8_t qs[QK4_0 / 2]; // nibbles / quants -} block_q4_0; - -#define QK4_1 32 -#define QR4_1 2 -#define QI4_1 (QK4_1 / (4 * QR4_1)) -typedef struct { - half2 dm; // dm.x = delta, dm.y = min - uint8_t qs[QK4_1 / 2]; // nibbles / quants -} block_q4_1; - -#define QK5_0 32 -#define QR5_0 2 -#define QI5_0 (QK5_0 / (4 * QR5_0)) -typedef struct { - half d; // delta - uint8_t qh[4]; // 5-th bit of quants - uint8_t qs[QK5_0 / 2]; // nibbles / quants -} block_q5_0; - -#define QK5_1 32 -#define QR5_1 2 -#define QI5_1 (QK5_1 / (4 * QR5_1)) -typedef struct { - half2 dm; // dm.x = delta, dm.y = min - uint8_t qh[4]; // 5-th bit of quants - uint8_t qs[QK5_1 / 2]; // nibbles / quants -} block_q5_1; - -#define QK8_0 32 -#define QR8_0 1 -#define QI8_0 (QK8_0 / (4 * QR8_0)) -typedef struct { - half d; // delta - int8_t qs[QK8_0]; // quants -} block_q8_0; - -#define QK8_1 32 -#define QR8_1 1 -#define QI8_1 (QK8_1 / (4 * QR8_1)) -typedef struct { - half2 ds; // ds.x = delta, ds.y = sum - int8_t qs[QK8_0]; // quants -} block_q8_1; - -#define QR2_K 4 -#define QI2_K (QK_K / (4*QR2_K)) -typedef struct { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - half2 dm; // super-block scale for quantized scales/mins -} block_q2_K; - -#define QR3_K 4 -#define QI3_K (QK_K / (4*QR3_K)) -typedef struct { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits - half d; // super-block scale -} block_q3_K; - -#define QR4_K 2 -#define QI4_K (QK_K / (4*QR4_K)) -typedef struct { - half2 dm; // super-block scale for quantized scales/mins - uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits - 
uint8_t qs[QK_K/2]; // 4--bit quants -} block_q4_K; - -#define QR5_K 2 -#define QI5_K (QK_K / (4*QR5_K)) -typedef struct { - half2 dm; // super-block scale for quantized scales/mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -} block_q5_K; - -#define QR6_K 2 -#define QI6_K (QK_K / (4*QR6_K)) -typedef struct { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales - half d; // delta -} block_q6_K; - -#define QR2_XXS 8 -#define QI2_XXS (QK_K / (4*QR2_XXS)) -typedef struct { - half d; - uint16_t qs[QK_K/8]; -} block_iq2_xxs; - -#define QR2_XS 8 -#define QI2_XS (QK_K / (4*QR2_XS)) -typedef struct { - half d; - uint16_t qs[QK_K/8]; - uint8_t scales[QK_K/32]; -} block_iq2_xs; - -#define QR2_S 8 -#define QI2_S (QK_K / (4*QR2_S)) -typedef struct { - half d; - uint8_t qs[QK_K/4]; - uint8_t qh[QK_K/32]; - uint8_t scales[QK_K/32]; -} block_iq2_s; - -#define QR3_XXS 8 -#define QI3_XXS (QK_K / (4*QR3_XXS)) -typedef struct { - half d; - uint8_t qs[3*(QK_K/8)]; -} block_iq3_xxs; - -#define QR3_XS 8 -#define QI3_XS (QK_K / (4*QR3_XS)) -#define IQ3S_N_SCALE QK_K/64 -typedef struct { - half d; - uint8_t qs[QK_K/4]; - uint8_t qh[QK_K/32]; - uint8_t signs[QK_K/8]; - uint8_t scales[IQ3S_N_SCALE]; -} block_iq3_s; - -// 1.5625 bpw -#define QR1_S 8 -#define QI1_S (QK_K / (4*QR1_S)) -typedef struct { - half d; - uint8_t qs[QK_K/8]; - uint16_t qh[QK_K/32]; -} block_iq1_s; - -// 1.75 bpw -#define QR1_M 8 -#define QI1_M (QK_K / (4*QR1_M)) -typedef struct { - uint8_t qs[QK_K/8]; // grid index, low 8 bits - uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8) - uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64) -} block_iq1_m; - -// Used by IQ1_M quants -typedef union { - half f16; - uint16_t u16; -} iq1m_scale_t; - -#define QK4_NL 32 -#define QR4_NL 2 
-#define QI4_NL (QK4_NL / (4*QR4_NL)) -typedef struct { - half d; - uint8_t qs[QK4_NL/2]; -} block_iq4_nl; - -#define QR4_XS 8 -#define QI4_XS (QK_K / (4*QR4_XS)) -typedef struct { - half d; - uint16_t scales_h; - uint8_t scales_l[QK_K/64]; - uint8_t qs[QK_K/2]; -} block_iq4_xs; - -static const __device__ uint64_t iq2xxs_grid[256] = { - 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, - 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808, - 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819, - 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819, - 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b, - 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808, - 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08, - 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b, - 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819, - 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08, - 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, - 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08, - 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808, - 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808, - 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919, - 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819, - 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08, - 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908, - 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819, - 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808, - 0x081908082b191919, 
0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808, - 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908, - 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808, - 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08, - 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819, - 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819, - 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819, - 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908, - 0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19, - 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819, - 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b, - 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808, - 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908, - 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08, - 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08, - 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908, - 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819, - 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808, - 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808, - 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19, - 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819, - 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, - 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b, - 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08, - 0x1919080808190819, 0x1919080808192b19, 
0x19190808082b0808, 0x191908082b080808, - 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908, - 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b, - 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819, - 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08, - 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08, - 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808, - 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b, - 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b, - 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908, - 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819, - 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808, - 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908, - 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b, - 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808, - 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b, - 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b, - 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808, - 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19, - 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908, -}; - -static const __device__ uint64_t iq2xs_grid[512] = { - 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, - 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, - 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, - 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, - 
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, - 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808, - 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819, - 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819, - 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, - 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b, - 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b, - 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908, - 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908, - 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919, - 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808, - 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919, - 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908, - 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, - 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, - 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08, - 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808, - 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808, - 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819, - 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908, - 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819, - 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808, - 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b, - 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819, - 0x08082b0808191908, 
0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819, - 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808, - 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908, - 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19, - 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b, - 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b, - 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919, - 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808, - 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819, - 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819, - 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b, - 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908, - 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808, - 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819, - 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808, - 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, - 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808, - 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808, - 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908, - 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908, - 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808, - 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b, - 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819, - 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, - 0x082b080808082b08, 0x082b080808082b2b, 
0x082b080808190819, 0x082b080808191908, - 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808, - 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908, - 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919, - 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08, - 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19, - 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b, - 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b, - 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808, - 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08, - 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b, - 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908, - 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b, - 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908, - 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, - 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808, - 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808, - 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08, - 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819, - 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919, - 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808, - 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808, - 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819, - 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819, - 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 
0x190819082b191908, - 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908, - 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b, - 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908, - 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908, - 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908, - 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808, - 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, - 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819, - 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819, - 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808, - 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b, - 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819, - 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819, - 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08, - 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808, - 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19, - 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919, - 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, - 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19, - 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b, - 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808, - 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b, - 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b, - 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, - 
0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b, - 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808, - 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819, - 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808, - 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808, - 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, - 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b, - 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19, - 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08, - 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919, - 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08, - 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08, - 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908, - 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908, - 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b, - 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908, - 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808, - 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b, - 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808, - 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808, - 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19, - 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08, - 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808, - 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, - 0x2b2b2b08082b0808, 
0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808, - 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b, - 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, -}; - -static const __device__ uint64_t iq2s_grid[1024] = { - 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, - 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, - 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, - 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, - 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, - 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b, - 0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919, - 0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808, - 0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908, - 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b, - 0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908, - 0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08, - 0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19, - 0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819, - 0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919, - 0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b, - 0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, - 0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908, - 0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919, - 0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908, - 0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 
0x080819080808192b, - 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919, - 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b, - 0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, - 0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908, - 0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b, - 0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b, - 0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08, - 0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, - 0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819, - 0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808, - 0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908, - 0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b, - 0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908, - 0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08, - 0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819, - 0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808, - 0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08, - 0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819, - 0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b, - 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908, - 0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919, - 0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b, - 0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919, - 0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808, - 
0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819, - 0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919, - 0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919, - 0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808, - 0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819, - 0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b, - 0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908, - 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, - 0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, - 0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919, - 0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b, - 0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919, - 0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b, - 0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819, - 0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919, - 0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908, - 0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b, - 0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908, - 0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b, - 0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908, - 0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08, - 0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908, - 0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819, - 0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819, - 0x0819082b19191908, 
0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808, - 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08, - 0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19, - 0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819, - 0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808, - 0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819, - 0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919, - 0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808, - 0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19, - 0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08, - 0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b, - 0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908, - 0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808, - 0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819, - 0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908, - 0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819, - 0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808, - 0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808, - 0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819, - 0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908, - 0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08, - 0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819, - 0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b, - 0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b, - 0x082b080808080808, 0x082b08080808082b, 
0x082b080808081919, 0x082b080808082b08, - 0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19, - 0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819, - 0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919, - 0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908, - 0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808, - 0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808, - 0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908, - 0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808, - 0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08, - 0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08, - 0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908, - 0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919, - 0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808, - 0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819, - 0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908, - 0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08, - 0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819, - 0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808, - 0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808, - 0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819, - 0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808, - 0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908, - 0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b, - 0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 
0x1908080808080819, - 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, - 0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b, - 0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808, - 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b, - 0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19, - 0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819, - 0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08, - 0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b, - 0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908, - 0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b, - 0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b, - 0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919, - 0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808, - 0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819, - 0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908, - 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08, - 0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08, - 0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819, - 0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919, - 0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908, - 0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b, - 0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908, - 0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b, - 0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908, - 
0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08, - 0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819, - 0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808, - 0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819, - 0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919, - 0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808, - 0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808, - 0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08, - 0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819, - 0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919, - 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808, - 0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819, - 0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919, - 0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808, - 0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b, - 0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908, - 0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808, - 0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908, - 0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b, - 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908, - 0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b, - 0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908, - 0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b, - 0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908, - 0x191908082b080808, 
0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08, - 0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908, - 0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b, - 0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908, - 0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08, - 0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819, - 0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919, - 0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808, - 0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19, - 0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b, - 0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919, - 0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808, - 0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819, - 0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908, - 0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919, - 0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808, - 0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808, - 0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b, - 0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919, - 0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808, - 0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b, - 0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808, - 0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919, - 0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b, - 0x192b080808190808, 0x192b08080819082b, 
0x192b080808191919, 0x192b080808192b08, - 0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919, - 0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808, - 0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b, - 0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908, - 0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808, - 0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808, - 0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808, - 0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908, - 0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808, - 0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808, - 0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b, - 0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908, - 0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808, - 0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808, - 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819, - 0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919, - 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b, - 0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808, - 0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819, - 0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b, - 0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908, - 0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08, - 0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908, - 0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 
0x2b08082b08081919, - 0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819, - 0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908, - 0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b, - 0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808, - 0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819, - 0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908, - 0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919, - 0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808, - 0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808, - 0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808, - 0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919, - 0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908, - 0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908, - 0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08, - 0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819, - 0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b, - 0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808, - 0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819, - 0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908, - 0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819, - 0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808, - 0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808, - 0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b, - 0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908, - 
0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808, - 0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908, - 0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819, - 0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819, - 0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808, - 0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b, - 0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b, - 0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819, - 0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b, - 0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b, - 0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b, - 0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819, - 0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19, - 0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819, - 0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908, - 0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808, - 0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b, -}; - -static const __device__ uint32_t iq3xxs_grid[256] = { - 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414, - 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14, - 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404, - 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e, - 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c, - 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 
0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c, - 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34, - 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c, - 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c, - 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04, - 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c, - 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414, - 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434, - 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c, - 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e, - 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24, - 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24, - 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c, - 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c, - 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14, - 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414, - 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e, - 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404, - 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c, - 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c, - 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 
0x2c143e14, - 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c, - 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c, - 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14, - 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14, - 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c, - 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, -}; - -static const __device__ uint32_t iq3xs_grid[512] = { - 0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14, - 0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414, - 0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24, - 0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c, - 0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c, - 0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34, - 0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c, - 0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414, - 0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c, - 0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404, - 0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434, - 0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c, - 0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404, - 0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 
0x0c040404, 0x0c04040c, 0x0c040414, - 0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414, - 0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404, - 0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c, - 0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c, - 0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404, - 0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e, - 0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14, - 0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c, - 0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424, - 0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c, - 0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c, - 0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e, - 0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e, - 0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e, - 0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424, - 0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e, - 0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424, - 0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404, - 0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c, - 0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e, - 
0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c, - 0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c, - 0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c, - 0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404, - 0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04, - 0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c, - 0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414, - 0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c, - 0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c, - 0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424, - 0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c, - 0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c, - 0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414, - 0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c, - 0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e, - 0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04, - 0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424, - 0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14, - 0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34, - 0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c, - 0x2c1c3e1c, 0x2c24040c, 0x2c240424, 
0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434, - 0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c, - 0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424, - 0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24, - 0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24, - 0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e, - 0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c, - 0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c, - 0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c, - 0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404, -}; - -#define IQ1S_DELTA 0.125f -#define IQ1M_DELTA 0.125f -static const __device__ uint64_t iq1s_grid_gpu[2048] = { - 0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000, - 0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101, - 0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200, - 0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212, - 0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011, - 0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111, - 0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220, - 0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022, - 0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220, - 0x02020222, 0x00011001, 0x00011100, 0x00011102, 
0x00021101, 0x01001001, 0x01001201, 0x01011101, - 0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110, - 0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111, - 0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010, - 0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210, - 0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221, - 0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021, - 0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002, - 0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101, - 0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101, - 0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211, - 0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110, - 0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022, - 0x00002220, 0x00002222, 0x00012121, 0x00022020, 0x00022022, 0x00022220, 0x00022222, 0x01002121, - 0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220, - 0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001, - 0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101, - 0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102, - 0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012, - 0x00110111, 0x00110210, 0x00120011, 0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010, - 0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 
0x01120111, - 0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122, - 0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222, - 0x01120120, 0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001, - 0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102, - 0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101, - 0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000, - 0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101, - 0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112, - 0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110, - 0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211, - 0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012, - 0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111, - 0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120, - 0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122, - 0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121, - 0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221, - 0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001, - 0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101, - 0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101, - 0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011, - 0x00122111, 0x01102012, 
0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111, - 0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011, - 0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122, - 0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121, - 0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222, - 0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101, - 0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000, - 0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200, - 0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 0x01200012, 0x01200110, - 0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112, - 0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222, - 0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021, - 0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121, - 0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201, - 0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 0x01211002, 0x01211101, 0x01211200, - 0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101, - 0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011, - 0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010, - 0x02211110, 0x02211111, 0x02211210, 0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211, - 0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121, - 0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 
0x02211020, 0x02211222, 0x00202000, - 0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202, - 0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202, - 0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211, - 0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112, - 0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020, - 0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121, - 0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222, - 0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102, - 0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100, - 0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110, - 0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011, - 0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111, - 0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110, - 0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121, - 0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222, - 0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201, - 0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102, - 0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201, - 0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012, - 0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010, - 
0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010, - 0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110, - 0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011, - 0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212, - 0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021, - 0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021, - 0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021, - 0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101, - 0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101, - 0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 0x12012100, - 0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010, - 0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111, - 0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010, - 0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 0x12002211, 0x12012012, 0x12012111, - 0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120, - 0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120, - 0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101, - 0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001, - 0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201, - 0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210, - 0x10100211, 0x10100212, 0x10110011, 
0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211, - 0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111, - 0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112, - 0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211, - 0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010, - 0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021, - 0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122, - 0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221, - 0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102, - 0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100, - 0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101, - 0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101, - 0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101, - 0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012, - 0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110, - 0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112, - 0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210, - 0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210, - 0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210, - 0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010, - 0x12111011, 0x12111110, 0x12111111, 0x12111112, 0x12111210, 0x12111211, 
0x12121011, 0x12121110, - 0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122, - 0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020, - 0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021, - 0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022, - 0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120, - 0x11121121, 0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222, - 0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221, - 0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001, - 0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 0x11112102, - 0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201, - 0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012, - 0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111, - 0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012, - 0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110, - 0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110, - 0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121, - 0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221, - 0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220, - 0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222, - 0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000, - 0x11210101, 
0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201, - 0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012, - 0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011, - 0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212, - 0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221, - 0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121, - 0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202, - 0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202, - 0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002, - 0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101, - 0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210, - 0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112, - 0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011, - 0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011, - 0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210, - 0x10201022, 0x10201221, 0x10211121, 0x10221020, 0x10221122, 0x10221220, 0x10221221, 0x11201020, - 0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220, - 0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222, - 0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222, - 0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001, - 0x11222201, 0x12202101, 0x12212001, 0x12212200, 
0x12222102, 0x10202011, 0x10202110, 0x10212010, - 0x10212111, 0x10222011, 0x10222110, 0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111, - 0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010, - 0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110, - 0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221, - 0x11212020, 0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122, - 0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202, - 0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100, - 0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101, - 0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112, - 0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111, - 0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211, - 0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222, - 0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221, - 0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022, - 0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101, - 0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211, - 0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111, - 0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111, - 0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010, - 0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 
0x20021121, - 0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222, - 0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000, - 0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202, - 0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000, - 0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202, - 0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110, - 0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110, - 0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222, - 0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120, - 0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022, - 0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 0x20110201, 0x20120101, - 0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202, - 0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110, - 0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110, - 0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111, - 0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111, - 0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 0x20100121, 0x20110021, 0x20110120, - 0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121, - 0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001, - 0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202, - 0x21111001, 0x21111100, 
0x21111101, 0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001, - 0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200, - 0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011, - 0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212, - 0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012, - 0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110, - 0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012, - 0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111, - 0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020, - 0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121, - 0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222, - 0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102, - 0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102, - 0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101, - 0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212, - 0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210, - 0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111, - 0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212, - 0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221, - 0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121, - 0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 
0x20210101, 0x20220000, 0x20220002, - 0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000, - 0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202, - 0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112, - 0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111, - 0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020, - 0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221, - 0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022, - 0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100, - 0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201, - 0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112, - 0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211, - 0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 0x22211012, - 0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121, - 0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020, - 0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120, - 0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 0x20222000, 0x20222002, 0x20222200, - 0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200, - 0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110, - 0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011, - 0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222, - 
0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020, - 0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222, -}; - -static const __device__ uint8_t ksigns_iq2xs[128] = { - 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, - 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, - 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175, - 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63, - 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207, - 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95, - 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111, - 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255, -}; - -static const __device__ uint64_t ksigns64[128] = { - 0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff, - 0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff, - 0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff, - 0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff, - 0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff, - 0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff, - 0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff, - 0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff, - 0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff, - 0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff, - 0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff, - 0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff, - 0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff, - 
0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff, - 0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff, - 0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff, - 0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff, - 0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff, - 0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff, - 0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff, - 0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff, - 0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff, - 0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff, - 0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff, - 0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff, - 0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff, - 0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff, - 0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff, - 0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff, - 0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff, - 0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff, - 0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff, -}; - -static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128}; -static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; - - -typedef half dfloat; // dequantize float -typedef half2 dfloat2; -typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); -template -using to_cuda_ggml_t = void (*)(const void * 
__restrict__ x, dst_t * __restrict__ y, int k, cudaStream_t stream); -typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); -typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc); -typedef void (*load_tiles_cuda_t)( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row); -typedef float (*vec_dot_q_mul_mat_cuda_t)( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k); - -// Utility function - -template -static __device__ __forceinline__ dst_t convert_from_half(half val) { - return val; -} - -template<> -__device__ __forceinline__ c10::BFloat16 convert_from_half(half val) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - return __float2bfloat16(__half2float(val)); -#else - return __half2float(val); -#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 -} - -template<> -__device__ __forceinline__ float convert_from_half(half val) { - return __half2float(val); -} - -#if defined(USE_ROCM) - -#ifndef __has_builtin - #define __has_builtin(x) 0 -#endif - -typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); -static __device__ __forceinline__ int __vsubss4(const int a, const int b) { - const int8x4_t va = reinterpret_cast(a); - const int8x4_t vb = reinterpret_cast(b); -#if __has_builtin(__builtin_elementwise_sub_sat) - const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); - return reinterpret_cast(c); -#else - int8x4_t c; - int16_t tmp; -#pragma unroll - for (int i = 0; i < 4; i++) { - tmp = va[i] - vb[i]; - if(tmp > std::numeric_limits::max()) tmp = std::numeric_limits::max(); - if(tmp < 
std::numeric_limits::min()) tmp = std::numeric_limits::min(); - c[i] = tmp; - } - return reinterpret_cast(c); -#endif // __has_builtin(__builtin_elementwise_sub_sat) -} - -static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { -#if __has_builtin(__builtin_amdgcn_sdot4) - c = __builtin_amdgcn_sdot4(a, b, c, false); -#else - const int8x4_t va = reinterpret_cast(a); - const int8x4_t vb = reinterpret_cast(b); - c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3]; -#endif - return c; -} - -static __device__ __forceinline__ uint32_t __vcmpeq4(const uint32_t a, const uint32_t b) { - uint32_t neq = a^b; - return !(neq & 0xff000000) * 0xff000000 | - !(neq & 0x00ff0000) * 0x00ff0000 | - !(neq & 0x0000ff00) * 0x0000ff00 | - !(neq & 0x000000ff) * 0x000000ff; -} - -static __device__ __forceinline__ uint32_t __vsub4(const uint32_t a, const uint32_t b) { - return (static_cast(((a & 0xff000000) >> 24) - ((b & 0xff000000) >> 24)) << 24) + - (static_cast(((a & 0x00ff0000) >> 16) - ((b & 0x00ff0000) >> 16)) << 16) + - (static_cast(((a & 0x0000ff00) >> 8) - ((b & 0x0000ff00) >> 8)) << 8) + - (static_cast(((a & 0x000000ff) >> 0) - ((b & 0x000000ff) >> 0)) << 0); -} -#endif // defined(USE_ROCM) diff --git a/csrc/quantization/gguf/gguf_kernel.cu b/csrc/quantization/gguf/gguf_kernel.cu deleted file mode 100644 index 76fe73e95040..000000000000 --- a/csrc/quantization/gguf/gguf_kernel.cu +++ /dev/null @@ -1,542 +0,0 @@ -#include -#include - -#include -#include - -#include "../../cuda_compat.h" -#include "dispatch_utils.h" - -#include "ggml-common.h" -#include "vecdotq.cuh" -#include "dequantize.cuh" -#include "mmvq.cuh" -#include "mmq.cuh" -#include "moe.cuh" -#include "moe_vec.cuh" - -// Q8 gemv -template -static __global__ void quantize_q8_1(const scalar_t* __restrict__ x, - void* __restrict__ vy, const int kx, - const int kx_padded) { - const auto ix = blockDim.x * blockIdx.x + threadIdx.x; - if (ix >= kx_padded) { - return; - } - const auto iy = 
blockDim.y * blockIdx.y + threadIdx.y; - const int i_padded = iy * kx_padded + ix; - - block_q8_1* y = (block_q8_1*)vy; - - const int ib = i_padded / QK8_1; // block index - const int iqs = i_padded % QK8_1; // quant index - - const float xi = ix < kx ? static_cast(x[iy * kx + ix]) : 0.0f; - float amax = fabsf(xi); - float sum = xi; - -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - amax = fmaxf(amax, VLLM_SHFL_XOR_SYNC_WIDTH(amax, mask, 32)); - sum += VLLM_SHFL_XOR_SYNC_WIDTH(sum, mask, 32); - } - - const float d = amax / 127; - const int8_t q = amax == 0.0f ? 0 : roundf(xi / d); - - y[ib].qs[iqs] = q; - - if (iqs > 0) { - return; - } - - y[ib].ds.x = __float2half(d); - y[ib].ds.y = __float2half(sum); -} - -template -static void quantize_row_q8_1_cuda(const scalar_t* x, void* vy, const int kx, - const int ky, cudaStream_t stream) { - const int64_t kx_padded = (kx + 512 - 1) / 512 * 512; - const int block_num_x = - (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; - constexpr int MAX_BLOCK_SIZE = 65535; - for (int off = 0; off < ky; off += MAX_BLOCK_SIZE) { - const int num_blocks_y = std::min(ky, off + MAX_BLOCK_SIZE) - off; - const dim3 num_blocks(block_num_x, num_blocks_y, 1); - const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1); - quantize_q8_1<<>>( - &x[off * kx], (int32_t*)vy + off * (kx_padded / 32 * 9), kx, kx_padded); - } -} - -torch::Tensor ggml_dequantize(torch::Tensor W, // quant weight - int64_t type, int64_t m, int64_t n, - std::optional const& dtype) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(W)); - auto dtype_ = dtype.value_or(torch::kFloat16); - auto options = torch::TensorOptions().dtype(dtype_).device(W.device()); - at::Tensor DW = torch::empty({m, n}, options); - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - - VLLM_DISPATCH_FLOATING_TYPES(DW.scalar_type(), "ggml_dequantize", [&] { - auto to_cuda = ggml_get_to_cuda(type); - to_cuda((void*)W.data_ptr(), 
(scalar_t*)DW.data_ptr(), m * n, stream); - }); - - return DW; -} - -torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, // quant weight - torch::Tensor X, // input - int64_t type, int64_t row) { - int col = X.sizes()[1]; - int vecs = X.sizes()[0]; - const int padded = (col + 512 - 1) / 512 * 512; - const at::cuda::OptionalCUDAGuard device_guard(device_of(X)); - auto options = torch::TensorOptions().dtype(X.dtype()).device(W.device()); - at::Tensor Y = torch::empty({vecs, row}, options); - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - options = torch::TensorOptions().dtype(torch::kInt32).device(W.device()); - at::Tensor quant_X = torch::empty({vecs, padded / 32 * 9}, options); - VLLM_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_mul_mat_vec_a8", [&] { - quantize_row_q8_1_cuda( - (scalar_t*)X.data_ptr(), (void*)quant_X.data_ptr(), col, vecs, stream); - switch (type) { - case 2: - mul_mat_vec_q4_0_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 3: - mul_mat_vec_q4_1_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 6: - mul_mat_vec_q5_0_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 7: - mul_mat_vec_q5_1_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 8: - mul_mat_vec_q8_0_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 10: - mul_mat_vec_q2_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 11: - mul_mat_vec_q3_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 12: - 
mul_mat_vec_q4_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 13: - mul_mat_vec_q5_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 14: - mul_mat_vec_q6_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 16: - mul_mat_vec_iq2_xxs_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 17: - mul_mat_vec_iq2_xs_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 18: - mul_mat_vec_iq3_xxs_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 19: - mul_mat_vec_iq1_s_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 20: - mul_mat_vec_iq4_nl_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 21: - mul_mat_vec_iq3_s_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 22: - mul_mat_vec_iq2_s_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 23: - mul_mat_vec_iq4_xs_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 29: - mul_mat_vec_iq1_m_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - } - }); - return Y; -} - -torch::Tensor ggml_mul_mat_a8(torch::Tensor W, // quant weight - torch::Tensor X, // input - int64_t type, int64_t row) { - int col = 
X.sizes()[1]; - int padded = (col + 512 - 1) / 512 * 512; - int batch = X.sizes()[0]; - const at::cuda::OptionalCUDAGuard device_guard(device_of(X)); - auto options = torch::TensorOptions().dtype(X.dtype()).device(W.device()); - at::Tensor Y = torch::empty({batch, row}, options); - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - options = torch::TensorOptions().dtype(torch::kInt32).device(W.device()); - at::Tensor quant_X = torch::empty({batch, padded / 32 * 9}, options); - VLLM_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_mul_mat_a8", [&] { - quantize_row_q8_1_cuda((scalar_t*)X.data_ptr(), (void*)quant_X.data_ptr(), - col, batch, stream); - - switch (type) { - case 2: - ggml_mul_mat_q4_0_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream); - break; - case 3: - ggml_mul_mat_q4_1_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream); - break; - case 6: - ggml_mul_mat_q5_0_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream); - break; - case 7: - ggml_mul_mat_q5_1_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream); - break; - case 8: - ggml_mul_mat_q8_0_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream); - break; - case 10: - ggml_mul_mat_q2_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream); - break; - case 11: - ggml_mul_mat_q3_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream); - break; - case 12: - ggml_mul_mat_q4_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, 
row, stream); - break; - case 13: - ggml_mul_mat_q5_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream); - break; - case 14: - ggml_mul_mat_q6_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream); - break; - } - }); - return Y; -} - -torch::Tensor ggml_moe_a8(torch::Tensor X, // input - torch::Tensor W, // expert weights - torch::Tensor sorted_token_ids, - torch::Tensor expert_ids, - torch::Tensor num_tokens_post_padded, int64_t type, - int64_t row, int64_t top_k, int64_t tokens) { - int col = X.sizes()[1]; - int padded = (col + 512 - 1) / 512 * 512; - const at::cuda::OptionalCUDAGuard device_guard(device_of(X)); - auto options = torch::TensorOptions().dtype(X.dtype()).device(W.device()); - at::Tensor Y = torch::empty({tokens * top_k, row}, options); - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - options = torch::TensorOptions().dtype(torch::kInt32).device(W.device()); - at::Tensor quant_X = torch::empty({tokens, padded / 32 * 9}, options); - VLLM_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_moe_a8", [&] { - quantize_row_q8_1_cuda((scalar_t*)X.data_ptr(), (void*)quant_X.data_ptr(), - col, tokens, stream); - switch (type) { - case 2: - ggml_moe_q4_0_q8_1_cuda( - (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream); - break; - case 3: - ggml_moe_q4_1_q8_1_cuda( - (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream); - break; - case 6: - ggml_moe_q5_0_q8_1_cuda( 
- (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream); - break; - case 7: - ggml_moe_q5_1_q8_1_cuda( - (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream); - break; - case 8: - ggml_moe_q8_0_q8_1_cuda( - (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream); - break; - case 10: - ggml_moe_q2_K_q8_1_cuda( - (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream); - break; - case 11: - ggml_moe_q3_K_q8_1_cuda( - (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream); - break; - case 12: - ggml_moe_q4_K_q8_1_cuda( - (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream); - break; - case 13: - ggml_moe_q5_K_q8_1_cuda( - (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - 
(scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream); - break; - case 14: - ggml_moe_q6_K_q8_1_cuda( - (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream); - break; - } - }); - return Y; -} - -torch::Tensor ggml_moe_a8_vec(torch::Tensor X, // input - torch::Tensor W, // expert weights - torch::Tensor topk_ids, int64_t top_k, - int64_t type, int64_t row, int64_t tokens) { - int col = X.sizes()[1]; - const int padded = (col + 512 - 1) / 512 * 512; - const at::cuda::OptionalCUDAGuard device_guard(device_of(X)); - auto options = torch::TensorOptions().dtype(X.dtype()).device(W.device()); - at::Tensor Y = torch::zeros({tokens * top_k, row}, options); - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - options = torch::TensorOptions().dtype(torch::kInt32).device(W.device()); - at::Tensor quant_X = torch::empty({tokens, padded / 32 * 9}, options); - VLLM_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_moe_vec_a8", [&] { - quantize_row_q8_1_cuda((scalar_t*)X.data_ptr(), - (void*)quant_X.data_ptr(), col, tokens, - stream); - switch (type) { - case 2: - moe_vec_q4_0_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 3: - moe_vec_q4_1_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 6: - moe_vec_q5_0_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), 
(int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 7: - moe_vec_q5_1_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 8: - moe_vec_q8_0_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 10: - moe_vec_q2_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 11: - moe_vec_q3_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 12: - moe_vec_q4_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 13: - moe_vec_q5_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 14: - moe_vec_q6_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 16: - moe_vec_iq2_xxs_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 17: - moe_vec_iq2_xs_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 18: - moe_vec_iq3_xxs_q8_1_cuda( - 
(void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 19: - moe_vec_iq1_s_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 20: - moe_vec_iq4_nl_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 21: - moe_vec_iq3_s_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 22: - moe_vec_iq2_s_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 23: - moe_vec_iq4_xs_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 29: - moe_vec_iq1_m_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - } - }); - return Y; -} - -int64_t ggml_moe_get_block_size(int64_t type) { - switch (type) { - case 2: - return MOE_X_Q4_0; - case 3: - return MOE_X_Q4_1; - case 6: - return MOE_X_Q5_0; - case 7: - return MOE_X_Q5_1; - case 8: - return MOE_X_Q8_0; - case 10: - return MOE_X_Q2_K; - case 11: - return MOE_X_Q3_K; - case 12: - return MOE_X_Q4_K; - case 13: - return MOE_X_Q5_K; - case 14: - return MOE_X_Q6_K; - } - return 0; -} diff --git a/csrc/quantization/gguf/mmq.cuh b/csrc/quantization/gguf/mmq.cuh deleted file mode 100644 index 7c89918c23d8..000000000000 --- 
a/csrc/quantization/gguf/mmq.cuh +++ /dev/null @@ -1,610 +0,0 @@ -// copied from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmq.cu -template -static __device__ __forceinline__ void mul_mat_q( - const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - - const block_q_t * x = (const block_q_t *) vx; - const block_q8_1 * y = (const block_q8_1 *) vy; - - const int blocks_per_row_x = ncols_x / qk; - const int blocks_per_col_y = nrows_y / QK8_1; - const int blocks_per_warp = WARP_SIZE_GGUF / qi; - - const int & ncols_dst = ncols_y; - - const auto row_dst_0 = blockIdx.x*mmq_y; - const int & row_x_0 = row_dst_0; - - const auto col_dst_0 = blockIdx.y*mmq_x; - const int & col_y_0 = col_dst_0; - - int * tile_x_ql = nullptr; - half2 * tile_x_dm = nullptr; - int * tile_x_qh = nullptr; - int * tile_x_sc = nullptr; - - allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - - __shared__ int tile_y_qs[mmq_x * WARP_SIZE_GGUF]; - __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE_GGUF/QI8_1]; - - float sum[mmq_y/WARP_SIZE_GGUF][mmq_x/nwarps] = {{0.0f}}; - - for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { - - load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, - threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x); - -#pragma unroll - for (int ir = 0; ir < qr && ib0 + ir * blocks_per_warp/qr < blocks_per_row_x; ++ir) { - const auto kqs = ir*WARP_SIZE_GGUF + threadIdx.x; - const int kbxd = kqs / QI8_1; - -#pragma unroll - for (int i = 0; i < mmq_x; i += nwarps) { - const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses - const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd]; - const int index_y = (threadIdx.y + i) * WARP_SIZE_GGUF + kqs % WARP_SIZE_GGUF; - tile_y_qs[index_y] = 
get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1); - } - -#pragma unroll - for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { - const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE_GGUF/QI8_1)) % mmq_x; - const auto kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1); - const int col_y_eff = min(col_y_0 + ids, ncols_y-1); - - // if the sum is not needed it's faster to transform the scale to f32 ahead of time - const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE_GGUF/QI8_1) + kby].ds; - half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE_GGUF/QI8_1) + kby]; - if (need_sum) { - *dsi_dst = *dsi_src; - } else { - float * dfi_dst = (float *) dsi_dst; - *dfi_dst = __low2float(*dsi_src); - } - } - - __syncthreads(); - -// #pragma unroll // unrolling this loop causes too much register pressure - for (int k = ir*WARP_SIZE_GGUF/qr; k < (ir+1)*WARP_SIZE_GGUF/qr; k += vdr) { -#pragma unroll - for (int j = 0; j < mmq_x; j += nwarps) { -#pragma unroll - for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) { - sum[i/WARP_SIZE_GGUF][j/nwarps] += vec_dot( - tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, - threadIdx.x + i, threadIdx.y + j, k); - } - } - } - __syncthreads(); - } - } - -#pragma unroll - for (int j = 0; j < mmq_x; j += nwarps) { - const auto col_dst = col_dst_0 + j + threadIdx.y; - if (col_dst >= ncols_dst) { - return; - } - -#pragma unroll - for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) { - const auto row_dst = row_dst_0 + threadIdx.x + i; - if (row_dst >= nrows_dst) { - continue; - } - dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE_GGUF][j/nwarps]; - } - } -} - -#if defined(USE_ROCM) -#define MMQ_X_Q4_0 64 -#define MMQ_Y_Q4_0 128 -#define NWARPS_Q4_0 8 -#else -#define MMQ_X_Q4_0 4 -#define MMQ_Y_Q4_0 32 -#define NWARPS_Q4_0 4 -#endif - -template static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q4_0, 2) -#endif -mul_mat_q4_0( - const void * __restrict__ vx, 
const void * __restrict__ vy, scalar_t * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - const int mmq_x = MMQ_X_Q4_0; - const int mmq_y = MMQ_Y_Q4_0; - const int nwarps = NWARPS_Q4_0; - - mul_mat_q, - load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -} - -template -static void ggml_mul_mat_q4_0_q8_1_cuda( - const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int mmq_x = MMQ_X_Q4_0; - int mmq_y = MMQ_Y_Q4_0; - int nwarps = NWARPS_Q4_0; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q4_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q4_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -#if defined(USE_ROCM) -#define MMQ_X_Q4_1 64 -#define MMQ_Y_Q4_1 128 -#define NWARPS_Q4_1 8 -#else -#define MMQ_X_Q4_1 4 -#define MMQ_Y_Q4_1 32 -#define NWARPS_Q4_1 4 -#endif - -template static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q4_1, 2) -#endif -mul_mat_q4_1( - const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - const int mmq_x = MMQ_X_Q4_1; - const int mmq_y = MMQ_Y_Q4_1; - const int nwarps = NWARPS_Q4_1; - - mul_mat_q, - load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -} - -template -static void 
ggml_mul_mat_q4_1_q8_1_cuda( - const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int mmq_x = MMQ_X_Q4_1; - int mmq_y = MMQ_Y_Q4_1; - int nwarps = NWARPS_Q4_1; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q4_1<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q4_1<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -#if defined(USE_ROCM) -#define MMQ_X_Q5_0 64 -#define MMQ_Y_Q5_0 128 -#define NWARPS_Q5_0 8 -#else -#define MMQ_X_Q5_0 4 -#define MMQ_Y_Q5_0 32 -#define NWARPS_Q5_0 4 -#endif - -template static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q5_0, 2) -#endif -mul_mat_q5_0( - const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - const int mmq_x = MMQ_X_Q5_0; - const int mmq_y = MMQ_Y_Q5_0; - const int nwarps = NWARPS_Q5_0; - - mul_mat_q, - load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -} - -template -static void ggml_mul_mat_q5_0_q8_1_cuda( - const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - const int mmq_x = MMQ_X_Q5_0; - const int mmq_y = MMQ_Y_Q5_0; - const int nwarps = NWARPS_Q5_0; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 
block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q5_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q5_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -#if defined(USE_ROCM) -#define MMQ_X_Q5_1 64 -#define MMQ_Y_Q5_1 128 -#define NWARPS_Q5_1 8 -#else -#define MMQ_X_Q5_1 4 -#define MMQ_Y_Q5_1 32 -#define NWARPS_Q5_1 4 -#endif - -template static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q5_1, 2) -#endif -mul_mat_q5_1( - const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - const int mmq_x = MMQ_X_Q5_1; - const int mmq_y = MMQ_Y_Q5_1; - const int nwarps = NWARPS_Q5_1; - - mul_mat_q, - load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -} - -template -static void ggml_mul_mat_q5_1_q8_1_cuda( - const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int mmq_x = MMQ_X_Q5_1; - const int mmq_y = MMQ_Y_Q5_1; - const int nwarps = NWARPS_Q5_1; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q5_1<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q5_1<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -#if defined(USE_ROCM) -#define MMQ_X_Q8_0 64 
-#define MMQ_Y_Q8_0 128 -#define NWARPS_Q8_0 8 -#else -#define MMQ_X_Q8_0 4 -#define MMQ_Y_Q8_0 32 -#define NWARPS_Q8_0 4 -#endif - -template static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q8_0, 2) -#endif -mul_mat_q8_0( - const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - const int mmq_x = MMQ_X_Q8_0; - const int mmq_y = MMQ_Y_Q8_0; - const int nwarps = NWARPS_Q8_0; - - mul_mat_q, - load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -} - -template -static void ggml_mul_mat_q8_0_q8_1_cuda( - const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int mmq_x = MMQ_X_Q8_0; - const int mmq_y = MMQ_Y_Q8_0; - const int nwarps = NWARPS_Q8_0; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q8_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q8_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -#if defined(USE_ROCM) -#define MMQ_X_Q2_K 64 -#define MMQ_Y_Q2_K 128 -#define NWARPS_Q2_K 8 -#else -#define MMQ_X_Q2_K 4 -#define MMQ_Y_Q2_K 32 -#define NWARPS_Q2_K 4 -#endif - -template static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q2_K, 2) -#endif -mul_mat_q2_K( - const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int 
nrows_y, const int nrows_dst) { - const int mmq_x = MMQ_X_Q2_K; - const int mmq_y = MMQ_Y_Q2_K; - const int nwarps = NWARPS_Q2_K; - - mul_mat_q, - load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -} - -template -static void ggml_mul_mat_q2_K_q8_1_cuda( - const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int mmq_x = MMQ_X_Q2_K; - const int mmq_y = MMQ_Y_Q2_K; - const int nwarps = NWARPS_Q2_K; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q2_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q2_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -#if defined(USE_ROCM) -#define MMQ_X_Q3_K 64 -#define MMQ_Y_Q3_K 128 -#define NWARPS_Q3_K 8 -#else -#define MMQ_X_Q3_K 4 -#define MMQ_Y_Q3_K 32 -#define NWARPS_Q3_K 4 -#endif - -template static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q3_K, 2) -#endif -mul_mat_q3_K( - const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - - const int mmq_x = MMQ_X_Q3_K; - const int mmq_y = MMQ_Y_Q3_K; - const int nwarps = NWARPS_Q3_K; - - mul_mat_q, - load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -} - -template -static void ggml_mul_mat_q3_K_q8_1_cuda( - const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x, - 
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - const int mmq_x = MMQ_X_Q3_K; - const int mmq_y = MMQ_Y_Q3_K; - const int nwarps = NWARPS_Q3_K; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q3_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q3_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -#if defined(USE_ROCM) -#define MMQ_X_Q4_K 64 -#define MMQ_Y_Q4_K 128 -#define NWARPS_Q4_K 8 -#else -#define MMQ_X_Q4_K 4 -#define MMQ_Y_Q4_K 32 -#define NWARPS_Q4_K 4 -#endif - -template static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q4_K, 2) -#endif -mul_mat_q4_K( - const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - const int mmq_x = MMQ_X_Q4_K; - const int mmq_y = MMQ_Y_Q4_K; - const int nwarps = NWARPS_Q4_K; - - mul_mat_q, - load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -} - -template -static void ggml_mul_mat_q4_K_q8_1_cuda( - const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int mmq_x = MMQ_X_Q4_K; - const int mmq_y = MMQ_Y_Q4_K; - const int nwarps = NWARPS_Q4_K; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % 
mmq_y == 0) { - const bool need_check = false; - mul_mat_q4_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q4_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -#if defined(USE_ROCM) -#define MMQ_X_Q5_K 64 -#define MMQ_Y_Q5_K 128 -#define NWARPS_Q5_K 8 -#else -#define MMQ_X_Q5_K 4 -#define MMQ_Y_Q5_K 32 -#define NWARPS_Q5_K 4 -#endif - -template static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q5_K, 2) -#endif -mul_mat_q5_K( - const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - const int mmq_x = MMQ_X_Q5_K; - const int mmq_y = MMQ_Y_Q5_K; - const int nwarps = NWARPS_Q5_K; - - mul_mat_q, - load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -} - -template -static void ggml_mul_mat_q5_K_q8_1_cuda( - const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - const int mmq_x = MMQ_X_Q5_K; - const int mmq_y = MMQ_Y_Q5_K; - const int nwarps = NWARPS_Q5_K; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q5_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q5_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -#if defined(USE_ROCM) -#define MMQ_X_Q6_K 64 -#define MMQ_Y_Q6_K 128 -#define NWARPS_Q6_K 8 -#else -#define MMQ_X_Q6_K 4 -#define MMQ_Y_Q6_K 32 -#define 
NWARPS_Q6_K 4 -#endif - -template static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q6_K, 2) -#endif -mul_mat_q6_K( - const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - const int mmq_x = MMQ_X_Q6_K; - const int mmq_y = MMQ_Y_Q6_K; - const int nwarps = NWARPS_Q6_K; - - mul_mat_q, - load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -} - -template -static void ggml_mul_mat_q6_K_q8_1_cuda( - const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int mmq_x = MMQ_X_Q6_K; - const int mmq_y = MMQ_Y_Q6_K; - const int nwarps = NWARPS_Q6_K; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q6_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q6_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} diff --git a/csrc/quantization/gguf/mmvq.cuh b/csrc/quantization/gguf/mmvq.cuh deleted file mode 100644 index e27bec7af5b7..000000000000 --- a/csrc/quantization/gguf/mmvq.cuh +++ /dev/null @@ -1,212 +0,0 @@ -// copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmvq.cu -template -static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, const int ncols, const int nrows, const int nvecs) { - const auto row = blockIdx.x*blockDim.y + threadIdx.y; - const auto vec = blockIdx.y; 
- - if (row >= nrows || vec >= nvecs) { - return; - } - - const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - const int nrows_y = (ncols + 512 - 1) / 512 * 512; - - - // partial sum for each thread - float tmp = 0.0f; - - const block_q_t * x = (const block_q_t *) vx; - const block_q8_1 * y = (const block_q8_1 *) vy; - - for (auto i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) { - const int ibx = row*blocks_per_row + i; // x block index - - const int iby = vec*(nrows_y/QK8_1) + i * (qk/QK8_1); // y block index that aligns with ibx - - const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int - - tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs); - } - - // sum up partial sums and write back result -#pragma unroll - for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) { - tmp += VLLM_SHFL_XOR_SYNC(tmp, mask); - } - - if (threadIdx.x == 0) { - dst[vec*nrows + row] = tmp; - } -} - -template -static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, nvecs, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows, nvecs); -} - -template -static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, nvecs, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows, nvecs); -} - -template -static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { - 
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, nvecs, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows, nvecs); -} - -template -static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, nvecs, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows, nvecs); -} - -template -static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, nvecs, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows, nvecs); -} - -template -static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, nvecs, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows, nvecs); -} - -template -static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, nvecs, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows, nvecs); -} - -template -static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * 
dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, nvecs, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows, nvecs); -} - -template -static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, nvecs, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows, nvecs); -} - -template -static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, nvecs, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows, nvecs); -} - -template -static void mul_mat_vec_iq2_xxs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, nvecs, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows, nvecs); -} - -template -static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, nvecs, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows, nvecs); -} - -template 
-static void mul_mat_vec_iq2_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, nvecs, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows, nvecs); -} - -template -static void mul_mat_vec_iq3_xxs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, nvecs, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows, nvecs); -} - -template -static void mul_mat_vec_iq1_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, nvecs, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows, nvecs); -} - -template -static void mul_mat_vec_iq1_m_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, nvecs, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows, nvecs); -} - -template -static void mul_mat_vec_iq4_nl_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, nvecs, 1); - const dim3 block_dims(WARP_SIZE, 
GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows, nvecs); -} - -template -static void mul_mat_vec_iq4_xs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, nvecs, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows, nvecs); -} - -template -static void mul_mat_vec_iq3_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, nvecs, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows, nvecs); -} diff --git a/csrc/quantization/gguf/moe.cuh b/csrc/quantization/gguf/moe.cuh deleted file mode 100644 index df9b84abcc13..000000000000 --- a/csrc/quantization/gguf/moe.cuh +++ /dev/null @@ -1,739 +0,0 @@ -#include - -/* Adapted from ./csrc/quantization/gguf/mmq.cuh - based on ./vllm/model_executor/layers/fused_moe/fused_moe.py */ -template -static __device__ __forceinline__ void moe_q( - const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* __restrict__ sorted_token_ids, - const int* __restrict__ expert_ids, - const int* __restrict__ num_tokens_post_padded, const int exp_stride, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, - const int nrows_dst, const int top_k) { - const int blocks_per_row_x = ncols_x / qk; - const int blocks_per_col_y = nrows_y / QK8_1; - const int blocks_per_warp = WARP_SIZE_GGUF / qi; - - const int ncols_dst = ncols_y * top_k; - - const auto row_dst_0 = blockIdx.x * mmq_y; - const int& row_x_0 = row_dst_0; - - const auto col_dst_0 = blockIdx.y * mmq_x; - - int 
token_offs[mmq_x / nwarps]; - for (int i = 0; i < mmq_x; i += nwarps) { - token_offs[i / nwarps] = sorted_token_ids[col_dst_0 + threadIdx.y + i]; - } - - const int exp_idx = expert_ids[blockIdx.y]; - if (exp_idx > 255 || exp_idx < 0) return; - if (blockIdx.y * mmq_x > num_tokens_post_padded[0]) return; - - const block_q_t* x = (const block_q_t*)((char*)vx + exp_idx * exp_stride); - const block_q8_1* y = (const block_q8_1*)(vy); - - int* tile_x_ql = nullptr; - half2* tile_x_dm = nullptr; - int* tile_x_qh = nullptr; - int* tile_x_sc = nullptr; - - allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - - __shared__ int tile_y_qs[mmq_x * WARP_SIZE_GGUF]; - __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE_GGUF / QI8_1]; - - float sum[mmq_y / WARP_SIZE_GGUF][mmq_x / nwarps] = {{0.0f}}; - - for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { - load_tiles(x + row_x_0 * blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, - tile_x_qh, tile_x_sc, threadIdx.y, nrows_x - row_x_0 - 1, - threadIdx.x, blocks_per_row_x); - - const int n_per_r = ((qk * blocks_per_warp) / qr); -#pragma unroll - for (int ir = 0; ir < qr && ib0 * qk + ir * n_per_r < ncols_x; ++ir) { - const auto kqs = ir * WARP_SIZE_GGUF + threadIdx.x; - const int kbxd = kqs / QI8_1; - -#pragma unroll - for (int i = 0; i < mmq_x; i += nwarps) { - const int col_y_eff = token_offs[i / nwarps] / top_k; - const int block_x = ib0 * (qk / QK8_1) + kbxd; - if (col_y_eff < ncols_y && block_x < blocks_per_col_y) { - const block_q8_1* by0 = &y[col_y_eff * blocks_per_col_y + block_x]; - const int index_y = - (threadIdx.y + i) * WARP_SIZE_GGUF + kqs % WARP_SIZE_GGUF; - tile_y_qs[index_y] = - get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1); - } - } - - if (threadIdx.x < n_per_r / QK8_1) { - const auto kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1); - const int col_y_eff = token_offs[threadIdx.y] / top_k; - const int block_x = - ib0 * (qk / QK8_1) + ir * (WARP_SIZE_GGUF / QI8_1) + kby; - - if (col_y_eff < 
ncols_y && block_x < blocks_per_col_y) { - const half2* dsi_src = &y[col_y_eff * blocks_per_col_y + block_x].ds; - half2* dsi_dst = - &tile_y_ds[threadIdx.y * (WARP_SIZE_GGUF / QI8_1) + kby]; - - if (need_sum) { - *dsi_dst = *dsi_src; - } else { - float* dfi_dst = (float*)dsi_dst; - *dfi_dst = __low2float(*dsi_src); - } - } - } - __syncthreads(); - - // #pragma unroll // unrolling this loop causes too much register pressure - for (int k = ir * WARP_SIZE_GGUF / qr; k < (ir + 1) * WARP_SIZE_GGUF / qr; - k += vdr) { -#pragma unroll - for (int j = 0; j < mmq_x; j += nwarps) { -#pragma unroll - for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) { - sum[i / WARP_SIZE_GGUF][j / nwarps] += - vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, - tile_y_ds, threadIdx.x + i, threadIdx.y + j, k); - } - } - } - __syncthreads(); - } - } - -#pragma unroll - for (int j = 0; j < mmq_x; j += nwarps) { - const int col_dst = token_offs[j / nwarps]; - if (col_dst >= ncols_dst) { - return; - } - -#pragma unroll - for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) { - const auto row_dst = row_dst_0 + threadIdx.x + i; - if (row_dst >= nrows_dst) { - continue; - } - dst[col_dst * nrows_dst + row_dst] = sum[i / WARP_SIZE_GGUF][j / nwarps]; - } - } -} - -#if defined(USE_ROCM) - #define MOE_X_Q4_0 8 - #define MOE_Y_Q4_0 128 - #define NWARPS_Q4_0 8 -#else - #define MOE_X_Q4_0 4 - #define MOE_Y_Q4_0 32 - #define NWARPS_Q4_0 4 -#endif - -template -static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_0, 2) -#endif - moe_q4_0(const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - const int top_k) { - const int mmq_x = MOE_X_Q4_0; - const int mmq_y = MOE_Y_Q4_0; - const int nwarps = NWARPS_Q4_0; - - moe_q, 
load_tiles_q4_0, - VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} - -template -static void ggml_moe_q4_0_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - int mmq_x = MOE_X_Q4_0; - int mmq_y = MOE_Y_Q4_0; - int nwarps = NWARPS_Q4_0; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - constexpr bool need_check = false; - moe_q4_0<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q4_0<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } -} - -#if defined(USE_ROCM) - #define MOE_X_Q4_1 8 - #define MOE_Y_Q4_1 128 - #define NWARPS_Q4_1 8 -#else - #define MOE_X_Q4_1 4 - #define MOE_Y_Q4_1 32 - #define NWARPS_Q4_1 4 -#endif - -template -static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_1, 2) -#endif - moe_q4_1(const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - const int top_k) { - const int mmq_x = MOE_X_Q4_1; - const int 
mmq_y = MOE_Y_Q4_1; - const int nwarps = NWARPS_Q4_1; - - moe_q, load_tiles_q4_1, - VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} - -template -static void ggml_moe_q4_1_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - int mmq_x = MOE_X_Q4_1; - int mmq_y = MOE_Y_Q4_1; - int nwarps = NWARPS_Q4_1; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - constexpr bool need_check = false; - moe_q4_1<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q4_1<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } -} - -#if defined(USE_ROCM) - #define MOE_X_Q5_0 8 - #define MOE_Y_Q5_0 128 - #define NWARPS_Q5_0 8 -#else - #define MOE_X_Q5_0 4 - #define MOE_Y_Q5_0 32 - #define NWARPS_Q5_0 4 -#endif - -template -static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_0, 2) -#endif - moe_q5_0(const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, 
- const int top_k) { - const int mmq_x = MOE_X_Q5_0; - const int mmq_y = MOE_Y_Q5_0; - const int nwarps = NWARPS_Q5_0; - - moe_q, load_tiles_q5_0, - VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} - -template -static void ggml_moe_q5_0_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - const int mmq_x = MOE_X_Q5_0; - const int mmq_y = MOE_Y_Q5_0; - const int nwarps = NWARPS_Q5_0; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - constexpr bool need_check = false; - moe_q5_0<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q5_0<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } -} - -#if defined(USE_ROCM) - #define MOE_X_Q5_1 8 - #define MOE_Y_Q5_1 128 - #define NWARPS_Q5_1 8 -#else - #define MOE_X_Q5_1 4 - #define MOE_Y_Q5_1 32 - #define NWARPS_Q5_1 4 -#endif - -template -static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_1, 2) -#endif - moe_q5_1(const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int 
ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - const int top_k) { - const int mmq_x = MOE_X_Q5_1; - const int mmq_y = MOE_Y_Q5_1; - const int nwarps = NWARPS_Q5_1; - - moe_q, load_tiles_q5_1, - VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} - -template -static void ggml_moe_q5_1_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - const int mmq_x = MOE_X_Q5_1; - const int mmq_y = MOE_Y_Q5_1; - const int nwarps = NWARPS_Q5_1; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - constexpr bool need_check = false; - moe_q5_1<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q5_1<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } -} - -#if defined(USE_ROCM) - #define MOE_X_Q8_0 8 - #define MOE_Y_Q8_0 128 - #define NWARPS_Q8_0 8 -#else - #define MOE_X_Q8_0 4 - #define MOE_Y_Q8_0 32 - #define NWARPS_Q8_0 4 -#endif - -template -static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q8_0, 2) -#endif - moe_q8_0(const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const 
int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - const int top_k) { - const int mmq_x = MOE_X_Q8_0; - const int mmq_y = MOE_Y_Q8_0; - const int nwarps = NWARPS_Q8_0; - - moe_q, load_tiles_q8_0, - VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} - -template -static void ggml_moe_q8_0_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - const int mmq_x = MOE_X_Q8_0; - const int mmq_y = MOE_Y_Q8_0; - const int nwarps = NWARPS_Q8_0; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - constexpr bool need_check = false; - moe_q8_0<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q8_0<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } -} - -#if defined(USE_ROCM) - #define MOE_X_Q2_K 8 - #define MOE_Y_Q2_K 128 - #define NWARPS_Q2_K 8 -#else - #define MOE_X_Q2_K 4 - #define MOE_Y_Q2_K 32 - #define NWARPS_Q2_K 4 -#endif - -template -static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q2_K, 2) -#endif - moe_q2_K(const void* __restrict__ vx, const void* 
__restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - const int top_k) { - const int mmq_x = MOE_X_Q2_K; - const int mmq_y = MOE_Y_Q2_K; - const int nwarps = NWARPS_Q2_K; - - moe_q, load_tiles_q2_K, - VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} - -template -static void ggml_moe_q2_K_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - const int mmq_x = MOE_X_Q2_K; - const int mmq_y = MOE_Y_Q2_K; - const int nwarps = NWARPS_Q2_K; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - constexpr bool need_check = false; - moe_q2_K<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q2_K<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } -} - -#if defined(USE_ROCM) - #define MOE_X_Q3_K 8 - #define MOE_Y_Q3_K 128 - #define NWARPS_Q3_K 8 -#else - #define MOE_X_Q3_K 4 - #define MOE_Y_Q3_K 32 - #define NWARPS_Q3_K 4 -#endif - -template -static __global__ void -#if defined(USE_ROCM) 
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q3_K, 2) -#endif - moe_q3_K(const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - const int top_k) { - - const int mmq_x = MOE_X_Q3_K; - const int mmq_y = MOE_Y_Q3_K; - const int nwarps = NWARPS_Q3_K; - - moe_q, load_tiles_q3_K, - VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} -template -static void ggml_moe_q3_K_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - const int mmq_x = MOE_X_Q3_K; - const int mmq_y = MOE_Y_Q3_K; - const int nwarps = NWARPS_Q3_K; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - constexpr bool need_check = false; - moe_q3_K<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q3_K<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } -} - -#if defined(USE_ROCM) - #define MOE_X_Q4_K 8 - #define MOE_Y_Q4_K 128 - #define NWARPS_Q4_K 8 -#else - #define MOE_X_Q4_K 4 - #define MOE_Y_Q4_K 32 - 
#define NWARPS_Q4_K 4 -#endif - -template -static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_K, 2) -#endif - moe_q4_K(const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - const int top_k) { - const int mmq_x = MOE_X_Q4_K; - const int mmq_y = MOE_Y_Q4_K; - const int nwarps = NWARPS_Q4_K; - - moe_q, load_tiles_q4_K, - VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} - -template -static void ggml_moe_q4_K_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - const int mmq_x = MOE_X_Q4_K; - const int mmq_y = MOE_Y_Q4_K; - const int nwarps = NWARPS_Q4_K; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - constexpr bool need_check = false; - moe_q4_K<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q4_K<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } -} - -#if defined(USE_ROCM) - #define MOE_X_Q5_K 8 - #define MOE_Y_Q5_K 
128 - #define NWARPS_Q5_K 8 -#else - #define MOE_X_Q5_K 4 - #define MOE_Y_Q5_K 32 - #define NWARPS_Q5_K 4 -#endif - -template -static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_K, 2) -#endif - moe_q5_K(const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - const int top_k) { - const int mmq_x = MOE_X_Q5_K; - const int mmq_y = MOE_Y_Q5_K; - const int nwarps = NWARPS_Q5_K; - - moe_q, load_tiles_q5_K, - VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} - -template -static void ggml_moe_q5_K_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - const int mmq_x = MOE_X_Q5_K; - const int mmq_y = MOE_Y_Q5_K; - const int nwarps = NWARPS_Q5_K; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - constexpr bool need_check = false; - moe_q5_K<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q5_K<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, 
top_k); - } -} - -#if defined(USE_ROCM) - #define MOE_X_Q6_K 8 - #define MOE_Y_Q6_K 128 - #define NWARPS_Q6_K 8 -#else - #define MOE_X_Q6_K 4 - #define MOE_Y_Q6_K 32 - #define NWARPS_Q6_K 4 -#endif - -template -static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q6_K, 2) -#endif - moe_q6_K(const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - const int top_k) { - const int mmq_x = MOE_X_Q6_K; - const int mmq_y = MOE_Y_Q6_K; - const int nwarps = NWARPS_Q6_K; - - moe_q, load_tiles_q6_K, - VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} - -template -static void ggml_moe_q6_K_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - const int mmq_x = MOE_X_Q6_K; - const int mmq_y = MOE_Y_Q6_K; - const int nwarps = NWARPS_Q6_K; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - constexpr bool need_check = false; - moe_q6_K<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q6_K<<>>( - w, inp, dst, sorted_token_ids, expert_ids, 
num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } -} diff --git a/csrc/quantization/gguf/moe_vec.cuh b/csrc/quantization/gguf/moe_vec.cuh deleted file mode 100644 index 60f65a1bfdcb..000000000000 --- a/csrc/quantization/gguf/moe_vec.cuh +++ /dev/null @@ -1,338 +0,0 @@ -// copied and adapted from -// https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmvq.cu -template -static __global__ void moe_vec_q(const void* __restrict__ vx, - const void* __restrict__ vy, - scalar_t* __restrict__ dst, - const int* topk_ids, const int topk, - const int ncols, const int nrows, - const int token_stride) { - const auto row = blockIdx.x * blockDim.y + threadIdx.y; - - const auto token = blockIdx.z / topk; - const auto expert = (topk_ids)[blockIdx.z]; - - if (row >= nrows) { - return; - } - - const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - - // partial sum for each thread - float tmp = 0.0f; - - const block_q_t* x = ((const block_q_t*)vx) + expert * nrows * blocks_per_row; - const block_q8_1* y = - (const block_q8_1*)(((const int*)vy) + token * token_stride); - - for (auto i = threadIdx.x / (qi / vdr); i < blocks_per_row; - i += blocks_per_warp) { - const int ibx = row * blocks_per_row + i; // x block index - - const int iby = i * (qk / QK8_1); // y block index that aligns with ibx - - const int iqs = - vdr * - (threadIdx.x % - (qi / vdr)); // x block quant index when casting the quants to int - - tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs); - } - - // sum up partial sums and write back result -#pragma unroll - for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { - tmp += VLLM_SHFL_XOR_SYNC(tmp, mask); - } - - if (threadIdx.x == 0) { - dst[blockIdx.z * nrows + row] = tmp; - } -} - -template -static void moe_vec_q4_0_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int 
token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_q4_1_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_q5_0_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_q5_1_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_q8_0_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - 
const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_q2_K_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_q3_K_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_q4_K_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_q5_K_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* 
topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_q6_K_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_iq2_xxs_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q - <<>>(vx, vy, dst, topk_ids, top_k, - ncols, nrows, token_stride); -} - -template -static void moe_vec_iq2_xs_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q - <<>>(vx, vy, dst, topk_ids, top_k, - ncols, nrows, token_stride); -} - -template -static void moe_vec_iq2_s_q8_1_cuda(const 
void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q - <<>>(vx, vy, dst, topk_ids, top_k, - ncols, nrows, token_stride); -} - -template -static void moe_vec_iq3_xxs_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q - <<>>(vx, vy, dst, topk_ids, top_k, - ncols, nrows, token_stride); -} - -template -static void moe_vec_iq1_s_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q - <<>>(vx, vy, dst, topk_ids, top_k, - ncols, nrows, token_stride); -} - -template -static void moe_vec_iq1_m_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q - <<>>(vx, vy, dst, topk_ids, top_k, - ncols, nrows, token_stride); 
-} - -template -static void moe_vec_iq4_nl_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_iq4_xs_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q - <<>>(vx, vy, dst, topk_ids, top_k, - ncols, nrows, token_stride); -} - -template -static void moe_vec_iq3_s_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q - <<>>(vx, vy, dst, topk_ids, top_k, - ncols, nrows, token_stride); -} diff --git a/csrc/quantization/gguf/vecdotq.cuh b/csrc/quantization/gguf/vecdotq.cuh deleted file mode 100644 index d0d4c74ed379..000000000000 --- a/csrc/quantization/gguf/vecdotq.cuh +++ /dev/null @@ -1,1812 +0,0 @@ -// copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/vecdotq.cuh -// and https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmq.cu -static __device__ __forceinline__ int get_int_b2(const void * 
x, const int & i32) { - const uint16_t * x16 = (const uint16_t *) x; // assume at least 2 byte alignment - - int x32 = x16[2*i32 + 0] << 0; - x32 |= x16[2*i32 + 1] << 16; - - return x32; -} - -static __device__ __forceinline__ int get_int_b4(const void * x, const int & i32) { - return ((const int *) x)[i32]; // assume at least 4 byte alignment -} - -static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) { - const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment - int x32 = 0; - x32 |= x16[0] << 0; - x32 |= x16[1] << 16; - return x32; -} - -static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) { - const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment - int x32 = 0; - x32 |= x16[0] << 0; - x32 |= x16[1] << 16; - return x32; -} - -static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) { - return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment -} - -static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) { - return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment -} - -// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called -// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q - -#define VDR_Q4_0_Q8_1_MMVQ 2 -#define VDR_Q4_0_Q8_1_MMQ 4 - -template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl( - const int * v, const int * u, const float & d4, const half2 & ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - int sumi = 0; - -#pragma unroll - for (int i = 0; i < vdr; ++i) { - const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; - const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; - - // SIMD dot product of quantized values - sumi = __dp4a(vi0, u[2*i+0], sumi); - sumi = __dp4a(vi1, 
u[2*i+1], sumi); - } - - const float2 ds8f = __half22float2(ds8); - - // second part effectively subtracts 8 from each quant value - return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); -#endif -} - -#define VDR_Q4_1_Q8_1_MMVQ 2 -#define VDR_Q4_1_Q8_1_MMQ 4 - -template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl( - const int * v, const int * u, const half2 & dm4, const half2 & ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - int sumi = 0; - -#pragma unroll - for (int i = 0; i < vdr; ++i) { - const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; - const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; - - // SIMD dot product of quantized values - sumi = __dp4a(vi0, u[2*i+0], sumi); - sumi = __dp4a(vi1, u[2*i+1], sumi); - } - - const float2 tmp = __half22float2(__hmul2(dm4, ds8)); - const float d4d8 = tmp.x; - const float m4s8 = tmp.y; - - // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it - return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); -#endif -} - -#define VDR_Q5_0_Q8_1_MMVQ 2 -#define VDR_Q5_0_Q8_1_MMQ 4 - -template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl( - const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - int sumi = 0; - -#pragma unroll - for (int i = 0; i < vdr; ++i) { - int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits - vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 - vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 - vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 - vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 - sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values - - int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits - vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 - vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 - vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 - vi1 |= 
(vh[i] << 9) & 0x10000000; // 19 -> 28 - sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values - } - - const float2 ds8f = __half22float2(ds8); - - // second part effectively subtracts 16 from each quant value - return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y); -#endif -} - - -#define VDR_Q5_1_Q8_1_MMVQ 2 -#define VDR_Q5_1_Q8_1_MMQ 4 - -template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl( - const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - int sumi = 0; - -#pragma unroll - for (int i = 0; i < vdr; ++i) { - int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits - vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 - vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 - vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 - vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 - sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values - - int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits - vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 - vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 - vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 - vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 - sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values - } - - const float2 tmp = __half22float2(__hmul2(dm5, ds8)); - const float d5d8 = tmp.x; - const float m5s8 = tmp.y; - - // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it - return sumi*d5d8 + m5s8 / (QI5_1 / vdr); -#endif -} - -#define VDR_Q8_0_Q8_1_MMVQ 2 -#define VDR_Q8_0_Q8_1_MMQ 8 - -template static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl( - const int * v, const int * u, const float & d8_0, const float & d8_1) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - int sumi = 0; - -#pragma unroll - for (int i = 0; i < vdr; ++i) { - // 
SIMD dot product of quantized values - sumi = __dp4a(v[i], u[i], sumi); - } - return d8_0*d8_1 * sumi; -#endif -} - -template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl( - const int * v, const int * u, const half2 & dm8, const half2 & ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - - int sumi = 0; - -#pragma unroll - for (int i = 0; i < vdr; ++i) { - // SIMD dot product of quantized values - sumi = __dp4a(v[i], u[i], sumi); - } - - const float2 tmp = __half22float2(__hmul2(dm8, ds8)); - const float d8d8 = tmp.x; - const float m8s8 = tmp.y; - - // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it - return sumi*d8d8 + m8s8 / (QI8_1 / vdr); -#endif -} - -#define VDR_Q2_K_Q8_1_MMVQ 1 -#define VDR_Q2_K_Q8_1_MMQ 2 - -// contiguous v/x values -static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq( - const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales, - const half2 & dm2, const float * __restrict__ d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - float sumf_d = 0.0f; - float sumf_m = 0.0f; - -#pragma unroll - for (int i = 0; i < QR2_K; ++i) { - const int sc = scales[2*i]; - - const int vi = (v >> (2*i)) & 0x03030303; - - sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product - - // fill int with 4x m - int m = sc >> 4; - m |= m << 8; - m |= m << 16; - sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values - } - - const float2 dm2f = __half22float2(dm2); - - return dm2f.x*sumf_d - dm2f.y*sumf_m; -#endif -} - -static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( - const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales, - const half2 & dm2, const float & d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - int sumi_d = 0; - int sumi_m = 0; - -#pragma unroll - for (int i0 = 0; i0 < QI8_1; 
i0 += QI8_1/2) { - int sumi_d_sc = 0; - - const int sc = scales[i0 / (QI8_1/2)]; - - // fill int with 4x m - int m = sc >> 4; - m |= m << 8; - m |= m << 16; - -#pragma unroll - for (int i = i0; i < i0 + QI8_1/2; ++i) { - sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product - sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m - } - - sumi_d += sumi_d_sc * (sc & 0xF); - } - - const float2 dm2f = __half22float2(dm2); - - return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m); -#endif -} - -#define VDR_Q3_K_Q8_1_MMVQ 1 -#define VDR_Q3_K_Q8_1_MMQ 2 - -// contiguous v/x values -static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq( - const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales, - const int & scale_offset, const float & d3, const float * __restrict__ d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - - float sumf = 0.0f; - -#pragma unroll - for (int i = 0; i < QR3_K; ++i) { - const int isc = scale_offset + 2*i; - - const int isc_low = isc % (QK_K/32); - const int sc_shift_low = 4 * (isc / (QK_K/32)); - const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF; - - const int isc_high = isc % (QK_K/64); - const int sc_shift_high = 2 * (isc / (QK_K/64)); - const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4; - - const int sc = (sc_low | sc_high) - 32; - - const int vil = (vl >> (2*i)) & 0x03030303; - - const int vih = ((vh >> i) << 2) & 0x04040404; - - const int vi = __vsubss4(vil, vih); - - sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product - } - - return d3 * sumf; -#endif -} - -static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq( - const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales, - const float & d3, const float & d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - int sumi = 0; - -#pragma unroll - for (int i0 = 0; i0 < 
QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) { - int sumi_sc = 0; - - for (int i = i0; i < i0 + QI8_1/2; ++i) { - sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product - } - - sumi += sumi_sc * scales[i0 / (QI8_1/2)]; - } - - return d3*d8 * sumi; -#endif -} - -#define VDR_Q4_K_Q8_1_MMVQ 2 -#define VDR_Q4_K_Q8_1_MMQ 8 - -// contiguous v/x values -static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( - const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, - const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - - float sumf_d = 0.0f; - float sumf_m = 0.0f; - -#pragma unroll - for (int i = 0; i < QR4_K; ++i) { - const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F; - const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F; - - const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product - const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u - - sumf_d += d8[i] * (dot1 * sc[i]); - sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values - } - - const float2 dm4f = __half22float2(dm4); - return dm4f.x*sumf_d - dm4f.y*sumf_m; -#endif -} - -static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( - const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, - const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - float sumf_d = 0.0f; - float sumf_m = 0.0f; - -#pragma unroll - for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) { - int sumi_d = 0; - -#pragma unroll - for (int j = 0; j < QI8_1; ++j) { - sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product - } - - const float2 ds8f = __half22float2(ds8[i]); - - sumf_d += ds8f.x * (sc[i] * sumi_d); - sumf_m += ds8f.y 
* m[i]; // sum of q8_1 block * q4_K min val - } - - const float2 dm4f = __half22float2(dm4); - - return dm4f.x*sumf_d - dm4f.y*sumf_m; -#endif -} - -#define VDR_Q5_K_Q8_1_MMVQ 2 -#define VDR_Q5_K_Q8_1_MMQ 8 - -static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( - const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc, - const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - - float sumf_d = 0.0f; - float sumf_m = 0.0f; - -#pragma unroll - for (int i = 0; i < QR5_K; ++i) { - const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F; - const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F; - - const int vh0i = ((vh[0] >> i) << 4) & 0x10101010; - const int vh1i = ((vh[1] >> i) << 4) & 0x10101010; - - const int v0i = vl0i | vh0i; - const int v1i = vl1i | vh1i; - - const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product - const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u - - sumf_d += d8[i] * (dot1 * sc[i]); - sumf_m += d8[i] * (dot2 * m[i]); - } - - const float2 dm5f = __half22float2(dm5); - return dm5f.x*sumf_d - dm5f.y*sumf_m; -#endif -} - -static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( - const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, - const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - float sumf_d = 0.0f; - float sumf_m = 0.0f; - -#pragma unroll - for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) { - int sumi_d = 0; - -#pragma unroll - for (int j = 0; j < QI8_1; ++j) { - sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product - } - - const float2 ds8f = __half22float2(ds8[i]); - - sumf_d += ds8f.x * (sc[i] * sumi_d); - sumf_m += ds8f.y * 
m[i]; // sum of q8_1 block * q4_K min val - } - - const float2 dm4f = __half22float2(dm4); - - return dm4f.x*sumf_d - dm4f.y*sumf_m; -#endif -} - -#define VDR_Q6_K_Q8_1_MMVQ 1 -#define VDR_Q6_K_Q8_1_MMQ 8 - -// contiguous v/x values -static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq( - const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales, - const float & d, const float * __restrict__ d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - float sumf = 0.0f; - -#pragma unroll - for (int i = 0; i < QR6_K; ++i) { - const int sc = scales[4*i]; - const int vil = (vl >> (4*i)) & 0x0F0F0F0F; - const int vih = ((vh >> (4*i)) << 4) & 0x30303030; - const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32 - - sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product - } - - return d*sumf; -#endif -} - -static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq( - const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc, - const float & d6, const float * __restrict__ d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - float sumf_d = 0.0f; - -#pragma unroll - for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) { - int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale - -#pragma unroll - for (int i = i0; i < i0 + 2; ++i) { - sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product - sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product - - sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product - sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product - } - - sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y); - } - - return d6 * sumf_d; -#endif -} - -static __device__ __forceinline__ float vec_dot_q4_0_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - - const block_q4_0 * bq4_0 = (const 
block_q4_0 *) vbq; - - int v[VDR_Q4_0_Q8_1_MMVQ]; - int u[2*VDR_Q4_0_Q8_1_MMVQ]; - -#pragma unroll - for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) { - v[i] = get_int_from_uint8(bq4_0->qs, iqs + i); - u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0); - } - - return vec_dot_q4_0_q8_1_impl(v, u, __half2float(bq4_0->d), bq8_1->ds); -} - -template static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[mmq_y * (WARP_SIZE_GGUF) + mmq_y]; - __shared__ float tile_x_d[mmq_y * (WARP_SIZE_GGUF/QI4_0) + mmq_y/QI4_0]; - *x_ql = tile_x_qs; - *x_dm = (half2 *) tile_x_d; -} - -template static __device__ __forceinline__ void load_tiles_q4_0( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - const int kbx = k / QI4_0; - const int kqsx = k % QI4_0; - - const block_q4_0 * bx0 = (const block_q4_0 *) vx; - float * x_dmf = (float *) x_dm; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - if (need_check) { - i = min(i, i_max); - } - const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); - // x_dmf[i * (WARP_SIZE_GGUF/QI4_0) + i / QI4_0 + kbx] = bxi->d; - } - - const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI4_0; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) { - int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; - if (need_check) { - i = min(i, i_max); - } - const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; - x_dmf[i * (WARP_SIZE_GGUF/QI4_0) + i / QI4_0 + kbxd] = __half2float(bxi->d); - } -} - -static __device__ __forceinline__ float 
vec_dot_q4_0_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - (void)x_qh; (void)x_sc; - - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const float * x_dmf = (const float *) x_dm; - - int u[2*VDR_Q4_0_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l) % WARP_SIZE_GGUF]; - u[2*l+1] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l + QI4_0) % WARP_SIZE_GGUF]; - } - - return vec_dot_q4_0_q8_1_impl - (&x_ql[i * (WARP_SIZE_GGUF + 1) + k], u, x_dmf[i * (WARP_SIZE_GGUF/QI4_0) + i/QI4_0 + k/QI4_0], - y_ds[j * (WARP_SIZE_GGUF/QI8_1) + (2*k/QI8_1) % (WARP_SIZE_GGUF/QI8_1)]); -} - -static __device__ __forceinline__ float vec_dot_q4_1_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - - const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; - - int v[VDR_Q4_1_Q8_1_MMVQ]; - int u[2*VDR_Q4_1_Q8_1_MMVQ]; - -#pragma unroll - for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) { - v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i); - u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1); - } - - return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, bq8_1->ds); -} - -template static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[mmq_y * (WARP_SIZE_GGUF) + + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI4_1) + mmq_y/QI4_1]; - *x_ql = tile_x_qs; - *x_dm = tile_x_dm; -} - -template static __device__ __forceinline__ void load_tiles_q4_1( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const 
int & i_max, const int & k, const int & blocks_per_row) { - const int kbx = k / QI4_1; - const int kqsx = k % QI4_1; - - const block_q4_1 * bx0 = (const block_q4_1 *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - if (need_check) { - i = min(i, i_max); - } - const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI4_1; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) { - int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row; - if (need_check) { - i = min(i, i_max); - } - const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd; - x_dm[i * (WARP_SIZE_GGUF/QI4_1) + i / QI4_1 + kbxd] = bxi->dm; - } -} - -static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - - int u[2*VDR_Q4_1_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l) % WARP_SIZE_GGUF]; - u[2*l+1] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l + QI4_1) % WARP_SIZE_GGUF]; - } - - return vec_dot_q4_1_q8_1_impl - (&x_ql[i * (WARP_SIZE_GGUF + 1) + k], u, x_dm[i * (WARP_SIZE_GGUF/QI4_1) + i/QI4_1 + k/QI4_1], - y_ds[j * (WARP_SIZE_GGUF/QI8_1) + (2*k/QI8_1) % (WARP_SIZE_GGUF/QI8_1)]); -} - -static __device__ __forceinline__ float vec_dot_q5_0_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - - const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; - - int vl[VDR_Q5_0_Q8_1_MMVQ]; - int vh[VDR_Q5_0_Q8_1_MMVQ]; - int u[2*VDR_Q5_0_Q8_1_MMVQ]; - 
-#pragma unroll - for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) { - vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i); - vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i)); - u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0); - } - - return vec_dot_q5_0_q8_1_impl(vl, vh, u, __half2float(bq5_0->d), bq8_1->ds); -} - -template static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE_GGUF) + mmq_y]; - __shared__ float tile_x_d[mmq_y * (WARP_SIZE_GGUF/QI5_0) + mmq_y/QI5_0]; - - *x_ql = tile_x_ql; - *x_dm = (half2 *) tile_x_d; -} - -template static __device__ __forceinline__ void load_tiles_q5_0( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - const int kbx = k / QI5_0; - const int kqsx = k % QI5_0; - - const block_q5_0 * bx0 = (const block_q5_0 *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx; - const int ql = get_int_from_uint8(bxi->qs, kqsx); - const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0)); - - int qs0 = (ql >> 0) & 0x0F0F0F0F; - qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 - qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 - qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 - qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 - qs0 = __vsubss4(qs0, 0x10101010); // subtract 16 - - x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2*k+0] = qs0; - - int qs1 = (ql >> 4) & 0x0F0F0F0F; - qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 - qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 - qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 - qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 - qs1 = __vsubss4(qs1, 
0x10101010); // subtract 16 - - x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2*k+1] = qs1; - } - - const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI5_0; - const int kbxd = k % blocks_per_tile_x_row; - float * x_dmf = (float *) x_dm; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) { - int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row; - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd; - x_dmf[i * (WARP_SIZE_GGUF/QI5_0) + i / QI5_0 + kbxd] = __half2float(bxi->d); - } -} - -static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const int index_bx = i * (WARP_SIZE_GGUF/QI5_0) + i/QI5_0 + k/QI5_0; - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - - int u[2*VDR_Q5_0_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l) % WARP_SIZE_GGUF]; - u[2*l+1] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l + QI5_0) % WARP_SIZE_GGUF]; - } - - return vec_dot_q8_0_q8_1_impl - (&x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE_GGUF/QI8_1) + (2*k/QI8_1) % (WARP_SIZE_GGUF/QI8_1)]); -} - -static __device__ __forceinline__ float vec_dot_q5_1_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - - const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; - - int vl[VDR_Q5_1_Q8_1_MMVQ]; - int vh[VDR_Q5_1_Q8_1_MMVQ]; - int u[2*VDR_Q5_1_Q8_1_MMVQ]; - -#pragma unroll - for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) { - vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i); - vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * 
(iqs + i)); - u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1); - } - - return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, bq8_1->ds); -} - -template static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE_GGUF) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI5_1) + mmq_y/QI5_1]; - - *x_ql = tile_x_ql; - *x_dm = tile_x_dm; -} - -template static __device__ __forceinline__ void load_tiles_q5_1( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - const int kbx = k / QI5_1; - const int kqsx = k % QI5_1; - - const block_q5_1 * bx0 = (const block_q5_1 *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx; - - const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); - const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1)); - - int qs0 = (ql >> 0) & 0x0F0F0F0F; - qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 - qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 - qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 - qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 - - x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2*k+0] = qs0; - - int qs1 = (ql >> 4) & 0x0F0F0F0F; - qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 - qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 - qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 - qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 - - x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2*k+1] = qs1; - } - - const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI5_1; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) { - int i 
= i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row; - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dm[i * (WARP_SIZE_GGUF/QI5_1) + i / QI5_1 + kbxd] = bxi->dm; - } -} - -static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const int index_bx = i * (WARP_SIZE_GGUF/QI5_1) + + i/QI5_1 + k/QI5_1; - - int u[2*VDR_Q5_1_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l) % WARP_SIZE_GGUF]; - u[2*l+1] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l + QI5_1) % WARP_SIZE_GGUF]; - } - - return vec_dot_q8_1_q8_1_impl - (&x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE_GGUF/QI8_1) + (2*k/QI8_1) % (WARP_SIZE_GGUF/QI8_1)]); -} - -static __device__ __forceinline__ float vec_dot_q8_0_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - - const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; - - int v[VDR_Q8_0_Q8_1_MMVQ]; - int u[VDR_Q8_0_Q8_1_MMVQ]; - -#pragma unroll - for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) { - v[i] = get_int_from_int8(bq8_0->qs, iqs + i); - u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - } - - return vec_dot_q8_0_q8_1_impl(v, u, __half2float(bq8_0->d), __low2float(bq8_1->ds)); -} - -template static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[mmq_y * (WARP_SIZE_GGUF) + mmq_y]; - __shared__ float tile_x_d[mmq_y * (WARP_SIZE_GGUF/QI8_0) + mmq_y/QI8_0]; - - *x_ql = tile_x_qs; - *x_dm = (half2 *) tile_x_d; -} - -template static __device__ 
__forceinline__ void load_tiles_q8_0( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - const int kbx = k / QI8_0; - const int kqsx = k % QI8_0; - float * x_dmf = (float *) x_dm; - - const block_q8_0 * bx0 = (const block_q8_0 *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_int8(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI8_0; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) { - int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; - - if (need_check) { - i = min(i, i_max); - } - const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; - x_dmf[i * (WARP_SIZE_GGUF/QI8_0) + i / QI8_0 + kbxd] = __half2float(bxi->d); - } -} - -static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - - return vec_dot_q8_0_q8_1_impl - (&x_ql[i * (WARP_SIZE_GGUF + 1) + k], &y_qs[j * WARP_SIZE_GGUF + k], x_dmf[i * (WARP_SIZE_GGUF/QI8_0) + i/QI8_0 + k/QI8_0], - y_df[j * (WARP_SIZE_GGUF/QI8_1) + k/QI8_1]); -} - -static __device__ __forceinline__ float vec_dot_q2_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - - const block_q2_K * bq2_K = (const block_q2_K *) vbq; - - const int bq8_offset = QR2_K * (iqs / QI8_1); - const int 
scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); - - const uint8_t * scales = bq2_K->scales + scale_offset; - - const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs); - int u[QR2_K]; - float d8[QR2_K]; - -#pragma unroll - for (int i = 0; i < QR2_K; ++ i) { - u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = __low2float(bq8_1[bq8_offset + i].ds); - } - - return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); -} - -template static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (WARP_SIZE_GGUF) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI2_K) + mmq_y/QI2_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE_GGUF/4) + mmq_y/4]; - - *x_ql = tile_x_ql; - *x_dm = tile_x_dm; - *x_sc = tile_x_sc; -} - -template static __device__ __forceinline__ void load_tiles_q2_K( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - const int kbx = k / QI2_K; - const int kqsx = k % QI2_K; - - const block_q2_K * bx0 = (const block_q2_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI2_K; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) { - int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd; - x_dm[i * (WARP_SIZE_GGUF/QI2_K) + i / QI2_K + kbxd] = bxi->dm; - } - -#pragma unroll 
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { - int i = i0 + i_offset * 4 + k / (WARP_SIZE_GGUF/4); - - if (need_check) { - i = min(i, i_max); - } - const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/4)) / (QI2_K/4); - x_sc[i * (WARP_SIZE_GGUF/4) + i / 4 + k % (WARP_SIZE_GGUF/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4)); - } -} - -static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - const int kbx = k / QI2_K; - const int ky = (k % QI2_K) * QR2_K; - const float * y_df = (const float *) y_ds; - - int v[QR2_K*VDR_Q2_K_Q8_1_MMQ]; - - const int kqsx = i * (WARP_SIZE_GGUF + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); - const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2)); - -#pragma unroll - for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) { - v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303; - } - - const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE_GGUF/4) + i/4 + kbx*4]) + ky/4; - - const int index_y = j * WARP_SIZE_GGUF + (QR2_K*k) % WARP_SIZE_GGUF; - return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE_GGUF/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); -} - -static __device__ __forceinline__ float vec_dot_q3_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - - const block_q3_K * bq3_K = (const block_q3_K *) vbq; - - const int bq8_offset = QR3_K * (iqs / (QI3_K/2)); - const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); - - const float d = __half2float(bq3_K->d); - - const int vl = get_int_from_uint8(bq3_K->qs, iqs); - - // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted - const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % 
(QI3_K/2)) >> bq8_offset; - - int u[QR3_K]; - float d8[QR3_K]; - -#pragma unroll - for (int i = 0; i < QR3_K; ++i) { - u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = __low2float(bq8_1[bq8_offset + i].ds); - } - - return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); -} - -template static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (WARP_SIZE_GGUF) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI3_K) + mmq_y/QI3_K]; - __shared__ int tile_x_qh[mmq_y * (WARP_SIZE_GGUF/2) + mmq_y/2]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE_GGUF/4) + mmq_y/4]; - - *x_ql = tile_x_ql; - *x_dm = tile_x_dm; - *x_qh = tile_x_qh; - *x_sc = tile_x_sc; -} - -template static __device__ __forceinline__ void load_tiles_q3_K( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - const int kbx = k / QI3_K; - const int kqsx = k % QI3_K; - - const block_q3_K * bx0 = (const block_q3_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - if (need_check) { - i = min(i, i_max); - } - const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI3_K; - const int kbxd = k % blocks_per_tile_x_row; - float * x_dmf = (float *) x_dm; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) { - int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y; - if (need_check) { - i = min(i, i_max); - } - const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd; - x_dmf[i * (WARP_SIZE_GGUF/QI3_K) + i / QI3_K + kbxd] = __half2float(bxi->d); - } - -#pragma unroll - for (int i0 = 
0; i0 < mmq_y; i0 += nwarps * 2) { - int i = i0 + i_offset * 2 + k / (WARP_SIZE_GGUF/2); - if (need_check) { - i = min(i, i_max); - } - const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/2)) / (QI3_K/2); - // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted - x_qh[i * (WARP_SIZE_GGUF/2) + i / 2 + k % (WARP_SIZE_GGUF/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2)); - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { - int i = i0 + i_offset * 4 + k / (WARP_SIZE_GGUF/4); - if (need_check) { - i = min(i, i_max); - } - const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/4)) / (QI3_K/4); - - const int ksc = k % (QI3_K/4); - - const int ksc_low = ksc % (QI3_K/8); - const int shift_low = 4 * (ksc / (QI3_K/8)); - const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F; - - const int ksc_high = QI3_K/8; - const int shift_high = 2 * ksc; - const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030; - - const int sc = __vsubss4(sc_low | sc_high, 0x20202020); - - x_sc[i * (WARP_SIZE_GGUF/4) + i / 4 + k % (WARP_SIZE_GGUF/4)] = sc; - } -} - -static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - - const int kbx = k / QI3_K; - const int ky = (k % QI3_K) * QR3_K; - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - - const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE_GGUF/4) + i/4 + kbx*4)) + ky/4; - - int v[QR3_K*VDR_Q3_K_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) { - const int kqsx = i * (WARP_SIZE_GGUF + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); - const int shift = 
2 * ((ky % 32) / 8); - const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303; - - const int vh = x_qh[i * (WARP_SIZE_GGUF/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); - const int vlh = (vh << 2) & 0x04040404; - - v[l] = __vsubss4(vll, vlh); - } - - const int index_y = j * WARP_SIZE_GGUF + (k*QR3_K) % WARP_SIZE_GGUF; - return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE_GGUF/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); -} - -static __device__ __forceinline__ float vec_dot_q4_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - const block_q4_K * bq4_K = (const block_q4_K *) vbq; - - int v[2]; - int u[2*QR4_K]; - float d8[QR4_K]; - - // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6 - const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2)); - - // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12 - // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44 - // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76 - // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108 - - const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); - v[0] = q4[0]; - v[1] = q4[4]; - - const uint16_t * scales = (const uint16_t *)bq4_K->scales; - uint16_t aux[2]; - const int j = bq8_offset/2; - if (j < 2) { - aux[0] = scales[j+0] & 0x3f3f; - aux[1] = scales[j+2] & 0x3f3f; - } else { - aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); - aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); - } - const uint8_t * sc = (const uint8_t *)aux; - const uint8_t * m = sc + 2; - - for (int i = 0; i < QR4_K; ++i) { - const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - d8[i] = __low2float(bq8i->ds); - - const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); - u[2*i+0] = q8[0]; - u[2*i+1] = q8[4]; - } - - return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8); -} - -template static 
__device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (WARP_SIZE_GGUF) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI4_K) + mmq_y/QI4_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE_GGUF/8) + mmq_y/8]; - - *x_ql = tile_x_ql; - *x_dm = tile_x_dm; - *x_sc = tile_x_sc; -} - -template static __device__ __forceinline__ void load_tiles_q4_K( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - const int kbx = k / QI4_K; // == 0 if QK_K == 256 - const int kqsx = k % QI4_K; // == k if QK_K == 256 - - const block_q4_K * bx0 = (const block_q4_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI4_K; // == 1 if QK_K == 256 - const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) { - int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y; - if (need_check) { - i = min(i, i_max); - } - const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; - x_dm[i * (WARP_SIZE_GGUF/QI4_K) + i / QI4_K + kbxd] = bxi->dm; - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE_GGUF/8)) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/8)) / (QI4_K/8); - - const int * scales = (const int *) bxi->scales; - - const int ksc = k % (WARP_SIZE_GGUF/8); - // scale arrangement after the following two 
lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 - int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits - scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits - - x_sc[i * (WARP_SIZE_GGUF/8) + i / 8 + ksc] = scales8; - } -} - -static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - (void)x_qh; - - const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE_GGUF/8) + i/8 + k/16]) + 2*((k % 16) / 8); - - const int index_y = j * WARP_SIZE_GGUF + (QR4_K*k) % WARP_SIZE_GGUF; - return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE_GGUF + 1) + k], &y_qs[index_y], sc, sc+8, - x_dm[i * (WARP_SIZE_GGUF/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); -} - -static __device__ __forceinline__ float vec_dot_q5_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - - const block_q5_K * bq5_K = (const block_q5_K *) vbq; - - int vl[2]; - int vh[2]; - int u[2*QR5_K]; - float d8[QR5_K]; - - const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2)); - const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); - const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4)); - - vl[0] = ql[0]; - vl[1] = ql[4]; - - vh[0] = qh[0] >> bq8_offset; - vh[1] = qh[4] >> bq8_offset; - - const uint16_t * scales = (const uint16_t *)bq5_K->scales; - uint16_t aux[2]; - const int j = bq8_offset/2; - if (j < 2) { - aux[0] = scales[j+0] & 0x3f3f; - aux[1] = scales[j+2] & 0x3f3f; - } else { - aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); - aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); - } - const uint8_t * sc = (const uint8_t *)aux; - const uint8_t * m = sc + 2; - -#pragma unroll 
- for (int i = 0; i < QR5_K; ++i) { - const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - d8[i] = __low2float(bq8i->ds); - - const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); - u[2*i+0] = q8[0]; - u[2*i+1] = q8[4]; - } - - return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8); -} - -template static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE_GGUF) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI5_K) + mmq_y/QI5_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE_GGUF/8) + mmq_y/8]; - - *x_ql = tile_x_ql; - *x_dm = tile_x_dm; - *x_sc = tile_x_sc; -} - -template static __device__ __forceinline__ void load_tiles_q5_K( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - const int kbx = k / QI5_K; // == 0 if QK_K == 256 - const int kqsx = k % QI5_K; // == k if QK_K == 256 - - const block_q5_K * bx0 = (const block_q5_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx; - const int ky = QR5_K*kqsx; - - const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); - const int ql0 = (ql >> 0) & 0x0F0F0F0F; - const int ql1 = (ql >> 4) & 0x0F0F0F0F; - - const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4)); - const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010; - const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010; - - const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0; - const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4); - - x_ql[i * (2*WARP_SIZE_GGUF + 1) + kq0] = ql0 | qh0; - x_ql[i * (2*WARP_SIZE_GGUF + 1) + kq1] = ql1 | qh1; - } - - const 
int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI5_K; // == 1 if QK_K == 256 - const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) { - int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; - x_dm[i * (WARP_SIZE_GGUF/QI5_K) + i / QI5_K + kbxd] = bxi->dm; - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE_GGUF/8)) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/8)) / (QI5_K/8); - - const int * scales = (const int *) bxi->scales; - - const int ksc = k % (WARP_SIZE_GGUF/8); - - // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 - int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits - scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits - - x_sc[i * (WARP_SIZE_GGUF/8) + i / 8 + ksc] = scales8; - } -} - -static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE_GGUF/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); - - const int index_x = i * (QR5_K*WARP_SIZE_GGUF + 1) + QR5_K*k; - const int index_y = j * WARP_SIZE_GGUF + (QR5_K*k) % WARP_SIZE_GGUF; - return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, - x_dm[i * (WARP_SIZE_GGUF/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); -} - -static __device__ __forceinline__ float vec_dot_q6_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * 
__restrict__ bq8_1, const int & iqs) { - - const block_q6_K * bq6_K = (const block_q6_K *) vbq; - - const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4); - const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8); - const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4)); - - const int vl = get_int_from_uint8(bq6_K->ql, iqs); - const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift; - - const int8_t * scales = bq6_K->scales + scale_offset; - - int u[QR6_K]; - float d8[QR6_K]; - -#pragma unroll - for (int i = 0; i < QR6_K; ++i) { - u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); - d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds); - } - - return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, __half2float(bq6_K->d), d8); -} - -template static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE_GGUF) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI6_K) + mmq_y/QI6_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE_GGUF/8) + mmq_y/8]; - - *x_ql = tile_x_ql; - *x_dm = tile_x_dm; - *x_sc = tile_x_sc; -} - -template static __device__ __forceinline__ void load_tiles_q6_K( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - const int kbx = k / QI6_K; // == 0 if QK_K == 256 - const int kqsx = k % QI6_K; // == k if QK_K == 256 - - const block_q6_K * bx0 = (const block_q6_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx; - const int ky = QR6_K*kqsx; - - const int ql = get_int_from_uint8(bxi->ql, kqsx); 
- const int ql0 = (ql >> 0) & 0x0F0F0F0F; - const int ql1 = (ql >> 4) & 0x0F0F0F0F; - - const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)); - const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030; - const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030; - - const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0; - const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2); - - x_ql[i * (2*WARP_SIZE_GGUF + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020); - x_ql[i * (2*WARP_SIZE_GGUF + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020); - } - - const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI6_K; // == 1 if QK_K == 256 - const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 - float * x_dmf = (float *) x_dm; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) { - int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dmf[i * (WARP_SIZE_GGUF/QI6_K) + i / QI6_K + kbxd] = __half2float(bxi->d); - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE_GGUF/8)) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/8)) / 4; - - x_sc[i * (WARP_SIZE_GGUF/8) + i / 8 + k % (WARP_SIZE_GGUF/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8)); - } -} - -static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - - const int8_t * sc = ((const int8_t *) &x_sc[i * 
(WARP_SIZE_GGUF/8) + i/8 + k/8]); - - const int index_x = i * (QR6_K*WARP_SIZE_GGUF + 1) + QR6_K*k; - const int index_y = j * WARP_SIZE_GGUF + (QR6_K*k) % WARP_SIZE_GGUF; - return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE_GGUF/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); -} - -static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq; - - const int ib32 = iqs; - const uint16_t * q2 = bq2->qs + 4*ib32; - const uint8_t * aux8 = (const uint8_t *)q2; - const int8_t * q8 = bq8_1[ib32].qs; - uint32_t aux32 = q2[2] | (q2[3] << 16); - int sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); - const uint8_t signs = ksigns_iq2xs[aux32 & 127]; - for (int j = 0; j < 8; ++j) { - sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1); - } - q8 += 8; - aux32 >>= 7; - } - const float d = __half2float(bq2->d) * (0.5f + aux32) * __half2float(bq8_1[ib32].ds.x) * 0.25f; - return d * sumi; -} - -static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq; - - const int ib32 = iqs; - const uint16_t * q2 = bq2->qs + 4*ib32; - const int8_t * q8 = bq8_1[ib32].qs; - const uint8_t ls1 = bq2->scales[ib32] & 0xf; - const uint8_t ls2 = bq2->scales[ib32] >> 4; - int sumi1 = 0; - for (int l = 0; l < 2; ++l) { - const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); - const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; - for (int j = 0; j < 8; ++j) { - sumi1 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? 
-1 : 1); - } - q8 += 8; - } - int sumi2 = 0; - for (int l = 2; l < 4; ++l) { - const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); - const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; - for (int j = 0; j < 8; ++j) { - sumi2 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1); - } - q8 += 8; - } - const float d = __half2float(bq2->d) * __half2float(bq8_1[ib32].ds.x) * 0.25f; - return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); -} - -static __device__ __forceinline__ float vec_dot_iq2_s_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - const block_iq2_s * bq2 = (const block_iq2_s *) vbq; - - const int ib32 = iqs; - const int8_t * q8 = bq8_1[ib32].qs; - const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32; - const uint8_t ls1 = bq2->scales[ib32] & 0xf; - const uint8_t ls2 = bq2->scales[ib32] >> 4; - int sumi1 = 0; - for (int l = 0; l < 2; ++l) { - const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); - const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201); - const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201); - const int grid_l = __vsub4(grid[0] ^ signs0, signs0); - const int grid_h = __vsub4(grid[1] ^ signs1, signs1); - sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1); - sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1); - q8 += 8; - } - int sumi2 = 0; - for (int l = 2; l < 4; ++l) { - const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); - const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201); - const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201); - const int grid_l = __vsub4(grid[0] ^ signs0, signs0); - const int grid_h = __vsub4(grid[1] ^ 
signs1, signs1); - sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2); - sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2); - q8 += 8; - } - const float d = __half2float(bq2->d) * __low2float(bq8_1[ib32].ds) * 0.25f; - return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); -#endif -} - -static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq; - - const int ib32 = iqs; - const uint8_t * q3 = bq2->qs + 8*ib32; - const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32; - const int8_t * q8 = bq8_1[ib32].qs; - uint32_t aux32 = gas[0] | (gas[1] << 16); - int sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0]; - const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1]; - const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127)); - const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]); - const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]); - sumi = __dp4a(grid_l, *((int *)q8+0), sumi); - sumi = __dp4a(grid_h, *((int *)q8+1), sumi); - q8 += 8; - aux32 >>= 7; - } - const float d = __half2float(bq2->d) * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f; - return d * sumi; -#endif -} - -static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - const block_iq3_s * bq2 = (const block_iq3_s *) vbq; - - const int ib32 = iqs; - const uint8_t * qs = bq2->qs + 8*ib32; - const int8_t * q8 = bq8_1[ib32].qs; - int sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint32_t * grid1 = iq3xs_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)); - const uint32_t * grid2 = iq3xs_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 
2*l)) & 256)); - uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201); - uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201); - const int grid_l = __vsub4(grid1[0] ^ signs0, signs0); - const int grid_h = __vsub4(grid2[0] ^ signs1, signs1); - sumi = __dp4a(grid_l, *((int *)q8+0), sumi); - sumi = __dp4a(grid_h, *((int *)q8+1), sumi); - q8 += 8; - } - const float d = __half2float(bq2->d) * (0.5f + ((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds) * 0.5f; - return d * sumi; -#endif -} - -static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - const block_iq1_s * bq1 = (const block_iq1_s *) vbq; - - const int qs_packed = get_int_b2(bq1->qs, iqs); - const uint8_t * qs = (const uint8_t *) &qs_packed; - - const int qh = bq1->qh[iqs]; - - int sumi = 0; -#pragma unroll - for (int l0 = 0; l0 < 8; l0 += 2) { - const int grid = iq1s_grid_gpu[qs[l0/2] | (((qh >> 3*(l0/2)) & 0x07) << 8)]; - - const int grid0 = (grid >> 0) & 0x0F0F0F0F; - const int grid1 = (grid >> 4) & 0x0F0F0F0F; - - const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0); - const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1); - - sumi = __dp4a(grid0, u0, sumi); - sumi = __dp4a(grid1, u1, sumi); - } - - const float d1q = __half2float(bq1->d) * (((qh >> 11) & 0x0E) + 1); - const float delta = -1.0f + IQ1S_DELTA - (qh & 0x8000) * (2.0f*IQ1S_DELTA/0x8000); - const float2 ds = __half22float2(bq8_1[iqs].ds); - return d1q * (ds.x*sumi + ds.y*delta); -#endif -} - -static __device__ __forceinline__ float vec_dot_iq1_m_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - - const block_iq1_m * bq1 = (const block_iq1_m *) vbq; - - const int 
qs_packed = get_int_b4(bq1->qs, iqs); - const uint8_t * qs = (const uint8_t *) &qs_packed; - - int sumi[2] = {0}; - float sumf[2] = {0.0f}; -#pragma unroll - for (int l0 = 0; l0 < 8; l0 += 2) { - const int qhl = bq1->qh[2*iqs + l0/4] >> (4 * ((l0/2) % 2)); - - const int grid = iq1s_grid_gpu[qs[l0/2] | ((qhl & 0x07) << 8)]; - - const int grid0 = (grid >> 0) & 0x0F0F0F0F; - const int grid1 = (grid >> 4) & 0x0F0F0F0F; - - const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0); - const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1); - - sumi[l0/4] = __dp4a(grid0, u0, sumi[l0/4]); - sumi[l0/4] = __dp4a(grid1, u1, sumi[l0/4]); - - const float delta = -1.0f + IQ1M_DELTA - (qhl & 0x08) * (2.0f*IQ1M_DELTA/0x08); - int sumy = 0; - sumy = __dp4a(u0, 0x01010101, sumy); - sumy = __dp4a(u1, 0x01010101, sumy); - sumf[l0/4] += delta*sumy; - } - - const uint16_t * sc = (const uint16_t *) bq1->scales; - - iq1m_scale_t scale; - scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00F0) | ((sc[2] >> 4) & 0x0F00) | (sc[3] & 0xF000); - const float d = __half2float(scale.f16) * __low2float(bq8_1[iqs].ds); - - const int tmp = sc[iqs/2] >> (6*(iqs%2)); - const int sc0 = 2*((tmp >> 0) & 0x07) + 1; - const int sc1 = 2*((tmp >> 3) & 0x07) + 1; - return d * ((sumi[0] + sumf[0]) * sc0 + (sumi[1] + sumf[1]) * sc1); -#endif -} - -static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values, - int & val1, int & val2) { - - uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32; - aux32 = q4 & 0x0f0f0f0f; - uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8); - uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8); - val1 = v1 | (v2 << 16); - aux32 = (q4 >> 4) & 0x0f0f0f0f; - v1 = values[q8[0]] | (values[q8[1]] << 8); - v2 = values[q8[2]] | (values[q8[3]] << 8); - val2 = v1 | (v2 << 16); -} - -static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ 
&& __CUDA_ARCH__ >= 610 || defined USE_ROCM - - const block_iq4_nl * bq = (const block_iq4_nl *) vbq; - - const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs; - const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs; - - const uint8_t * values = (const uint8_t *)kvalues_iq4nl; - - int v1, v2; - int sumi1 = 0, sumi2 = 0; - for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) { - const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16); - get_int_from_table_16(aux, values, v1, v2); - sumi1 = __dp4a(v1, q8[l+0], sumi1); - sumi2 = __dp4a(v2, q8[l+4], sumi2); - } - const float d = __half2float(bq->d) * __low2float(bq8_1->ds); - return d * (sumi1 + sumi2); -#endif -} - - -static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM - const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq; - const uint8_t * values = (const uint8_t *)kvalues_iq4nl; - - // iqs is 0...7 - const int ib32 = iqs; - const int32_t * q8 = (const int *)bq8_1[ib32].qs; - const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32; - const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4); - const float d = __half2float(bq4->d) * (ls - 32) * __low2float(bq8_1[ib32].ds); - int v1, v2; - int sumi1 = 0, sumi2 = 0; - for (int j = 0; j < 4; ++j) { - get_int_from_table_16(q4[j], values, v1, v2); - sumi1 = __dp4a(v1, q8[j+0], sumi1); - sumi2 = __dp4a(v2, q8[j+4], sumi2); - } - return d * (sumi1 + sumi2); -#endif -} \ No newline at end of file diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 27966e6808e9..b601760d4119 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -339,39 +339,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { #endif - // Dequantization for GGML. - ops.def( - "ggml_dequantize(Tensor W, int type, SymInt m, SymInt n, ScalarType? 
" - "dtype) -> Tensor"); - ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize); - - // mmvq kernel for GGML. - ops.def( - "ggml_mul_mat_vec_a8(Tensor W, Tensor X, int type, SymInt row) " - "-> Tensor"); - ops.impl("ggml_mul_mat_vec_a8", torch::kCUDA, &ggml_mul_mat_vec_a8); - - // mmq kernel for GGML. - ops.def( - "ggml_mul_mat_a8(Tensor W, Tensor X, int type, SymInt row) -> Tensor"); - ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8); - - // moe kernel for GGML. - ops.def( - "ggml_moe_a8(Tensor X, Tensor W, " - "Tensor sorted_token_ids, Tensor expert_ids, Tensor " - "num_tokens_post_padded, " - "int type, SymInt row, SymInt top_k, SymInt tokens) -> Tensor"); - ops.impl("ggml_moe_a8", torch::kCUDA, &ggml_moe_a8); - - ops.def( - "ggml_moe_a8_vec(Tensor X, Tensor W, " - "Tensor topk_ids, int top_k, " - "int type, SymInt row, SymInt tokens) -> Tensor"); - ops.impl("ggml_moe_a8_vec", torch::kCUDA, &ggml_moe_a8_vec); - - ops.def("ggml_moe_get_block_size", &ggml_moe_get_block_size); - #ifndef USE_ROCM // Expert-specialization mxfp8 blockscaled grouped quantization (SM100+). ops.def( diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md index 0b8fc71d3f30..549eeda98675 100644 --- a/docs/features/quantization/README.md +++ b/docs/features/quantization/README.md @@ -9,7 +9,6 @@ The following are the supported quantization formats for vLLM: - [AutoAWQ](auto_awq.md) - [BitsAndBytes](bnb.md) -- [GGUF](gguf.md) - [GPTQModel](gptqmodel.md) - [Intel Neural Compressor](inc.md) - [INT4 W4A16](int4.md) @@ -53,7 +52,6 @@ th:not(:first-child) { | FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | | bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | | DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | -| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. - ✅︎ indicates that the quantization method is supported on the specified hardware. 
diff --git a/docs/features/quantization/gguf.md b/docs/features/quantization/gguf.md deleted file mode 100644 index 41912a506014..000000000000 --- a/docs/features/quantization/gguf.md +++ /dev/null @@ -1,87 +0,0 @@ -# GGUF - -!!! warning - Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team. - -!!! warning - Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge them to a single-file model. - -To run a GGUF model with vLLM, you can use the `repo_id:quant_type` format to load directly from HuggingFace. For example, to load a Q4_K_M quantized model from [unsloth/Qwen3-0.6B-GGUF](https://huggingface.co/unsloth/Qwen3-0.6B-GGUF): - -```bash -# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. -vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M --tokenizer Qwen/Qwen3-0.6B -``` - -You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs: - -```bash -vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M \ - --tokenizer Qwen/Qwen3-0.6B \ - --tensor-parallel-size 2 -``` - -Alternatively, you can download and use a local GGUF file: - -```bash -wget https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf -vllm serve ./Qwen3-0.6B-Q4_K_M.gguf --tokenizer Qwen/Qwen3-0.6B -``` - -!!! warning - We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size. - -GGUF assumes that HuggingFace can convert the metadata to a config file. 
In case HuggingFace doesn't support your model you can manually create a config and pass it as hf-config-path - -```bash -# If your model is not supported by HuggingFace you can manually provide a HuggingFace compatible config path -vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M \ - --tokenizer Qwen/Qwen3-0.6B \ - --hf-config-path Qwen/Qwen3-0.6B -``` - -You can also use the GGUF model directly through the LLM entrypoint: - -??? code - - ```python - from vllm import LLM, SamplingParams - - # In this script, we demonstrate how to pass input to the chat method: - conversation = [ - { - "role": "system", - "content": "You are a helpful assistant", - }, - { - "role": "user", - "content": "Hello", - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?", - }, - { - "role": "user", - "content": "Write an essay about the importance of higher education.", - }, - ] - - # Create a sampling params object. - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - # Create an LLM using repo_id:quant_type format. - llm = LLM( - model="unsloth/Qwen3-0.6B-GGUF:Q4_K_M", - tokenizer="Qwen/Qwen3-0.6B", - ) - # Generate texts from the prompts. The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.chat(conversation, sampling_params) - - # Print the outputs. 
- for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - ``` diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py index 194db05e395e..07fbd7e4d555 100644 --- a/docs/mkdocs/hooks/generate_examples.py +++ b/docs/mkdocs/hooks/generate_examples.py @@ -32,7 +32,6 @@ def title(text: str) -> str: "mae": "MAE", "ner": "NER", "tpu": "TPU", - "gguf": "GGUF", "lora": "LoRA", "nccl": "NCCL", "rlhf": "RLHF", diff --git a/requirements/common.txt b/requirements/common.txt index b610fd678687..e4b7b339f95d 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -30,7 +30,6 @@ filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/31 partial-json-parser # used for parsing partial JSON outputs pyzmq >= 25.0.0 msgspec -gguf >= 0.17.0 mistral_common[image] >= 1.11.0 opencv-python-headless >= 4.13.0 # required for video IO pyyaml diff --git a/requirements/test/rocm.txt b/requirements/test/rocm.txt index e1efae912ee4..d58ccdef3537 100644 --- a/requirements/test/rocm.txt +++ b/requirements/test/rocm.txt @@ -322,10 +322,6 @@ genson==1.3.0 # via datamodel-code-generator geopandas==1.1.3 # via terratorch -gguf==0.18.0 - # via - # -c requirements/common.txt - # -r requirements/test/../common.txt gitdb==4.0.12 # via gitpython gitpython==3.1.46 @@ -686,7 +682,6 @@ numpy==2.2.6 # fastparquet # genai-perf # geopandas - # gguf # h5py # imagehash # imageio @@ -1119,7 +1114,6 @@ pyyaml==6.0.3 # datamodel-code-generator # datasets # genai-perf - # gguf # huggingface-hub # jsonargparse # lightning @@ -1176,7 +1170,6 @@ requests==2.32.5 # diffusers # docker # evaluate - # gguf # google-api-core # google-cloud-storage # gpt-oss @@ -1453,7 +1446,6 @@ tqdm==4.67.3 # -r requirements/test/../common.txt # datasets # evaluate - # gguf # huggingface-hub # lightly # lightning diff --git a/tests/compile/fullgraph/test_full_graph.py 
b/tests/compile/fullgraph/test_full_graph.py index ed4c92d90ff7..cc138454802b 100644 --- a/tests/compile/fullgraph/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -39,12 +39,6 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None): ] ) - # TODO: figure out why this fails. - if False and is_quant_method_supported("gguf"): # noqa: SIM223 - TEST_MODELS.append( - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {"quantization": "gguf"}) - ) - if is_quant_method_supported("gptq"): TEST_MODELS.append( ("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {"quantization": "gptq"}) diff --git a/tests/kernels/quantization/test_ggml.py b/tests/kernels/quantization/test_ggml.py deleted file mode 100644 index 0dc24187f2b3..000000000000 --- a/tests/kernels/quantization/test_ggml.py +++ /dev/null @@ -1,54 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import gguf -import pytest -import torch - -from tests.kernels.utils import opcheck -from vllm import _custom_ops as ops # noqa: F401 - - -@pytest.mark.parametrize("quant_type", [12]) -def test_ggml_opcheck(quant_type): - block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type] - shape = [256, 1152] - qweight = torch.randint(0, 100, shape, device="cuda", dtype=torch.uint8) - m = qweight.shape[0] - n = qweight.shape[1] // type_size * block_size - opcheck(torch.ops._C.ggml_dequantize, (qweight, quant_type, m, n, torch.float16)) - - x = torch.rand((m, 512), device="cuda", dtype=torch.float16) - opcheck(torch.ops._C.ggml_mul_mat_a8, (qweight, x, quant_type, qweight.shape[0])) - opcheck( - torch.ops._C.ggml_mul_mat_vec_a8, (qweight, x, quant_type, qweight.shape[0]) - ) - - shape = [256, 1024, 336] - qweight = torch.randint(0, 100, shape, device="cuda", dtype=torch.uint8) - x = torch.rand((1, 1024), device="cuda", dtype=torch.float16) - sorted_token_ids = torch.arange(776, device="cuda") - expert_ids = torch.randint(0, 256, (194,), 
device="cuda") - num_tokens_post_padded = torch.tensor([1], dtype=torch.int64, device="cuda") - - opcheck( - torch.ops._C.ggml_moe_a8, - ( - x, - qweight, - sorted_token_ids, - expert_ids, - num_tokens_post_padded, - quant_type, - qweight.shape[0], - 1, - x.shape[0], - ), - ) - - topk_ids = torch.zeros((1, 1), device="cuda", dtype=torch.int32) - - opcheck( - torch.ops._C.ggml_moe_a8_vec, - (x, qweight, topk_ids, 1, quant_type, qweight.shape[0], x.shape[0]), - ) diff --git a/tests/kernels/quantization/test_gguf.py b/tests/kernels/quantization/test_gguf.py deleted file mode 100644 index 912d5fee4e59..000000000000 --- a/tests/kernels/quantization/test_gguf.py +++ /dev/null @@ -1,207 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from pathlib import Path - -import pytest -import torch -from gguf import GGMLQuantizationType, GGUFReader, ReaderTensor, dequantize -from huggingface_hub import snapshot_download - -import vllm._custom_ops as ops -from vllm.model_executor.layers.fused_moe import fused_experts -from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf -from vllm.utils.torch_utils import set_random_seed - -GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample") -GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample") - - -def get_gguf_sample_tensors( - hidden_size: int, quant_type: GGMLQuantizationType -) -> list[ReaderTensor]: - sample_dir = GGUF_SAMPLE - filename = f"Quant_{quant_type.name}_{hidden_size}.gguf" - sample_file = Path(sample_dir) / filename - return GGUFReader(sample_file).tensors - - -def get_gguf_MoE_tensors( - hidden_size: int, quant_type: GGMLQuantizationType -) -> list[ReaderTensor]: - sample_dir = GGUF_SAMPLE_MOE - filename = f"Quant_{quant_type.name}_{hidden_size}.gguf" - sample_file = Path(sample_dir) / filename - return GGUFReader(sample_file).tensors - - -DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32] -# 
Hidden_size for testing, must match the sample file in HF repo, -# we have `hidden_size = 256, 1024` for test in HF repo currently. -HIDDEN_SIZES = [256, 1024] -NUM_TOKENS = [7, 2050] # Arbitrary values for testing -SEEDS = [0] -QUANT_TYPES = [ - # i-matrix - GGMLQuantizationType.IQ1_M, - GGMLQuantizationType.IQ1_S, - GGMLQuantizationType.IQ2_S, - GGMLQuantizationType.IQ2_XS, - GGMLQuantizationType.IQ3_S, - GGMLQuantizationType.IQ3_XXS, - GGMLQuantizationType.IQ4_NL, - GGMLQuantizationType.IQ4_XS, - # k-quants - GGMLQuantizationType.Q2_K, - GGMLQuantizationType.Q3_K, - GGMLQuantizationType.Q4_K, - GGMLQuantizationType.Q5_K, - GGMLQuantizationType.Q6_K, - # standard quantization - GGMLQuantizationType.Q4_0, - GGMLQuantizationType.Q5_0, - GGMLQuantizationType.Q8_0, -] - - -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("quant_type", QUANT_TYPES) -@torch.inference_mode() -def test_dequantize( - hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType -): - tensors = get_gguf_sample_tensors(hidden_size, quant_type) - for tensor in tensors: - shape_str = tensor.name.split("_")[-1] - shape = map(int, shape_str.split("x")) - - ref_output = torch.tensor( - dequantize(tensor.data, quant_type), device="cuda" - ).to(dtype) - output = ops.ggml_dequantize( - torch.tensor(tensor.data, device="cuda"), quant_type, *list(shape), dtype - ) - - torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=4e-2) - - -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("quant_type", QUANT_TYPES) -@torch.inference_mode() -def test_mmvq(hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType): - set_random_seed(0) - - tensors = get_gguf_sample_tensors(hidden_size, quant_type) - x = torch.rand((1, hidden_size), dtype=dtype, device="cuda") - for tensor in tensors: - weight = torch.tensor(dequantize(tensor.data, 
quant_type), device="cuda").to( - dtype - ) - ref_output = x @ weight.T - - qweight = torch.tensor(tensor.data, device="cuda") - output = ops.ggml_mul_mat_vec_a8(qweight, x, quant_type, qweight.shape[0]).to( - dtype - ) - - torch.testing.assert_close(output, ref_output, atol=1, rtol=1e-1) - - -@pytest.mark.parametrize("num_tokens", NUM_TOKENS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize( - "quant_type", - [ - # k-quants - GGMLQuantizationType.Q2_K, - GGMLQuantizationType.Q3_K, - GGMLQuantizationType.Q4_K, - GGMLQuantizationType.Q5_K, - GGMLQuantizationType.Q6_K, - # standard quants - GGMLQuantizationType.Q4_0, - GGMLQuantizationType.Q5_0, - GGMLQuantizationType.Q8_0, - ], -) -@torch.inference_mode() -def test_mmq( - num_tokens: int, - hidden_size: int, - dtype: torch.dtype, - quant_type: GGMLQuantizationType, -): - set_random_seed(0) - - tensors = get_gguf_sample_tensors(hidden_size, quant_type) - x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda") - for tensor in tensors: - weight = torch.tensor(dequantize(tensor.data, quant_type), device="cuda").to( - dtype - ) - ref_output = x @ weight.T - - qweight = torch.tensor(tensor.data, device="cuda") - output = ops.ggml_mul_mat_a8(qweight, x, quant_type, qweight.shape[0]) - atols = {torch.half: 1, torch.bfloat16: 1.5, torch.float: 1.2} - # test matrix has inputs centered around 0 and lower precision from - # bfloat16 tends to accumulate and can greatly inflate rtol - # since outputs are also very close to 0 - rtols = {torch.half: 1e-1, torch.bfloat16: 1e4, torch.float: 2e1} - torch.testing.assert_close( - output, ref_output, atol=atols[dtype], rtol=rtols[dtype] - ) - - -@pytest.mark.parametrize("num_tokens", NUM_TOKENS) -@pytest.mark.parametrize("hidden_size", [512]) -@pytest.mark.parametrize("top_k", [4, 8]) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("quant_type", QUANT_TYPES) 
-@torch.inference_mode() -def test_moe( - num_tokens: int, - hidden_size: int, - dtype: torch.dtype, - quant_type: GGMLQuantizationType, - top_k: int, -): - set_random_seed(0) - H, E = 1024, 256 - - x = torch.rand((num_tokens, H), dtype=dtype, device="cuda") - - topk_weights = torch.rand(num_tokens, top_k, device="cuda", dtype=dtype) - topk_ids = torch.randint( - 0, E, (num_tokens, top_k), device="cuda", dtype=torch.int32 - ) - - tensors = get_gguf_MoE_tensors(hidden_size, quant_type) - - w13 = tensors[0] - w2 = tensors[1] - - w13_dequant = torch.tensor(dequantize(w13.data, quant_type), device="cuda").to( - dtype - ) - - w2_dequant = torch.tensor(dequantize(w2.data, quant_type), device="cuda").to(dtype) - - output = _fused_moe_gguf( - x, - torch.tensor(w13.data, device="cuda"), - torch.tensor(w2.data, device="cuda"), - topk_weights, - topk_ids, - quant_type, - quant_type, - "silu", - ) - - ref_output = fused_experts( - x, w13_dequant, w2_dequant, topk_weights, topk_ids - ).reshape(output.shape) - torch.testing.assert_close(output, ref_output, atol=1, rtol=1e-1) diff --git a/tests/models/multimodal/generation/test_multimodal_gguf.py b/tests/models/multimodal/generation/test_multimodal_gguf.py deleted file mode 100644 index 813dccf1451b..000000000000 --- a/tests/models/multimodal/generation/test_multimodal_gguf.py +++ /dev/null @@ -1,180 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os - -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -from typing import Any, NamedTuple - -import pytest -from huggingface_hub import hf_hub_download -from pytest import MarkDecorator -from transformers import AutoModelForImageTextToText - -from tests.quantization.utils import is_quant_method_supported -from vllm.assets.image import ImageAsset -from vllm.multimodal.image import rescale_image_size -from vllm.utils.torch_utils import set_default_torch_num_threads - -from ....conftest import IMAGE_ASSETS, 
HfRunner, VllmRunner -from ...utils import check_logprobs_close - - -class GGUFMMTestConfig(NamedTuple): - original_model: str - gguf_repo: str - gguf_backbone: str - gguf_mmproj: str - prompt: list[str] - image_names: list[str] # Store names, load PIL images at runtime - max_model_len: int = 4096 - marks: list[MarkDecorator] = [] - mm_processor_kwargs: dict[str, Any] = {} - - @property - def gguf_model(self): - hf_hub_download(self.gguf_repo, filename=self.gguf_mmproj) - return hf_hub_download(self.gguf_repo, filename=self.gguf_backbone) - - -# Common prompts aligned with test_common.py "gemma3" entry format -_GEMMA3_PROMPTS = IMAGE_ASSETS.prompts( - { - "stop_sign": ( - "user\n" - "What's the content in the center of the image?" - "\nmodel\n" - ), - "cherry_blossom": ( - "user\n" - "What is the season?" - "\nmodel\n" - ), - } -) - -# Image asset names - load at runtime to avoid pickle issues with subprocess -_GEMMA3_IMAGE_NAMES = ["stop_sign", "cherry_blossom"] - -# Regular multimodal (no pan-and-scan) - uses QAT Q4_0 GGUF -GEMMA3_CONFIG = GGUFMMTestConfig( - original_model="google/gemma-3-4b-it", - gguf_repo="google/gemma-3-4b-it-qat-q4_0-gguf", - gguf_backbone="gemma-3-4b-it-q4_0.gguf", - gguf_mmproj="mmproj-model-f16-4B.gguf", - prompt=_GEMMA3_PROMPTS, - image_names=_GEMMA3_IMAGE_NAMES, - max_model_len=4096, - marks=[pytest.mark.core_model], - mm_processor_kwargs={}, -) - -# Pan-and-scan multimodal - uses unquantized BF16 GGUF -GEMMA3_CONFIG_PAN_AND_SCAN = GGUFMMTestConfig( - original_model="google/gemma-3-4b-it", - gguf_repo="unsloth/gemma-3-4b-it-GGUF", - gguf_backbone="gemma-3-4b-it-BF16.gguf", - gguf_mmproj="mmproj-BF16.gguf", - prompt=_GEMMA3_PROMPTS, - image_names=_GEMMA3_IMAGE_NAMES, - max_model_len=4096, - marks=[pytest.mark.core_model], - mm_processor_kwargs={"do_pan_and_scan": True}, -) - -MODELS_TO_TEST = [GEMMA3_CONFIG, GEMMA3_CONFIG_PAN_AND_SCAN] - - -def run_multimodal_gguf_test( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - 
model: GGUFMMTestConfig, - dtype: str, - max_tokens: int, - num_logprobs: int, -): - # Load images at runtime (inside subprocess) to avoid pickle issues - images = [ImageAsset(name).pil_image for name in model.image_names] - size_factors = [0.25, 0.5, 1.0] - inputs_per_image = [ - ( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) - for image, prompt in zip(images, model.prompt) - ] - - # NOTE: Run vLLM first to avoid CUDA init issues with multiprocessing fork. - # Run GGUF model via vLLM. - with ( - set_default_torch_num_threads(1), - vllm_runner( - model_name=model.gguf_model, - enforce_eager=True, - tokenizer_name=model.original_model, - dtype=dtype, - max_model_len=model.max_model_len, - mm_processor_kwargs=model.mm_processor_kwargs, - ) as gguf_model, - ): - gguf_outputs_per_case = [ - gguf_model.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images, - ) - for prompts, images in inputs_per_image - ] - - # Then run HfRunner for HuggingFace baseline comparison. 
- with hf_runner( - model.original_model, - dtype=dtype, - auto_cls=AutoModelForImageTextToText, - ) as hf_model: - hf_outputs_per_case = [ - hf_model.generate_greedy_logprobs_limit( - prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images, - ) - for prompts, images in inputs_per_image - ] - - for hf_outputs, gguf_outputs in zip(hf_outputs_per_case, gguf_outputs_per_case): - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=gguf_outputs, - name_0="hf", - name_1="gguf", - ) - - -@pytest.mark.skipif( - not is_quant_method_supported("gguf"), - reason="gguf is not supported on this GPU type.", -) -@pytest.mark.parametrize( - "model", - [ - pytest.param(test_config, marks=test_config.marks) - for test_config in MODELS_TO_TEST - ], -) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [10]) -def test_gemma3_mm_gguf( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - model: GGUFMMTestConfig, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: - run_multimodal_gguf_test( - hf_runner, vllm_runner, model, dtype, max_tokens, num_logprobs - ) diff --git a/tests/models/quantization/test_gguf.py b/tests/models/quantization/test_gguf.py deleted file mode 100644 index 064ca94f3cba..000000000000 --- a/tests/models/quantization/test_gguf.py +++ /dev/null @@ -1,204 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Tests gguf models against unquantized models generations -Note: To pass the test, quantization higher than Q4 should be used -""" - -import os -from typing import NamedTuple - -import pytest -from huggingface_hub import hf_hub_download -from pytest import MarkDecorator -from transformers import AutoTokenizer - -from tests.quantization.utils import is_quant_method_supported - -from ...conftest import VllmRunner -from ...utils import multi_gpu_test -from ..utils 
import check_logprobs_close - -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -MAX_MODEL_LEN = 1024 - - -class GGUFTestConfig(NamedTuple): - original_model: str - gguf_repo: str - gguf_filename: str - marks: list[MarkDecorator] = [] - - @property - def gguf_model(self): - return hf_hub_download(self.gguf_repo, filename=self.gguf_filename) - - -LLAMA_CONFIG = GGUFTestConfig( - original_model="meta-llama/Llama-3.2-1B-Instruct", - gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF", - gguf_filename="Llama-3.2-1B-Instruct-Q6_K.gguf", -) - -QWEN2_CONFIG = GGUFTestConfig( - original_model="Qwen/Qwen2.5-1.5B-Instruct", - gguf_repo="Qwen/Qwen2.5-1.5B-Instruct-GGUF", - gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf", -) - -QWEN3_CONFIG = GGUFTestConfig( - original_model="Qwen/Qwen3-0.6B", - gguf_repo="unsloth/Qwen3-0.6B-GGUF", - gguf_filename="Qwen3-0.6B-BF16.gguf", -) - -PHI3_CONFIG = GGUFTestConfig( - original_model="microsoft/Phi-3.5-mini-instruct", - gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF", - gguf_filename="Phi-3.5-mini-instruct-IQ4_XS.gguf", -) - -GPT2_CONFIG = GGUFTestConfig( - original_model="openai-community/gpt2-large", - gguf_repo="QuantFactory/gpt2-large-GGUF", - gguf_filename="gpt2-large.Q4_K_M.gguf", -) - -STABLELM_CONFIG = GGUFTestConfig( - original_model="stabilityai/stablelm-3b-4e1t", - gguf_repo="afrideva/stablelm-3b-4e1t-GGUF", - gguf_filename="stablelm-3b-4e1t.q4_k_m.gguf", -) - -STARCODER_CONFIG = GGUFTestConfig( - original_model="bigcode/starcoder2-3b", - gguf_repo="QuantFactory/starcoder2-3b-GGUF", - gguf_filename="starcoder2-3b.Q6_K.gguf", -) - -DOLPHIN_CONFIG = GGUFTestConfig( - # Test VocabParallelEmbedding sharding issue. 
- original_model="cognitivecomputations/TinyDolphin-2.8-1.1b", - gguf_repo="tsunemoto/TinyDolphin-2.8-1.1b-GGUF", - gguf_filename="tinydolphin-2.8-1.1b.Q6_K.gguf", -) - -GEMMA3_CONFIG = GGUFTestConfig( - original_model="google/gemma-3-270m-it", - gguf_repo="ggml-org/gemma-3-270m-it-qat-GGUF", - gguf_filename="gemma-3-270m-it-qat-Q4_0.gguf", -) - -MODELS = [ - # LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458 - QWEN2_CONFIG, - QWEN3_CONFIG, - PHI3_CONFIG, - GPT2_CONFIG, - STABLELM_CONFIG, - DOLPHIN_CONFIG, - GEMMA3_CONFIG, - # STARCODER_CONFIG, # broken -] - - -def check_model_outputs( - vllm_runner: type[VllmRunner], - prompts: list[str], - model: GGUFTestConfig, - dtype: str, - max_tokens: int, - num_logprobs: int, - tp_size: int, -): - tokenizer = AutoTokenizer.from_pretrained(model.original_model) - if tokenizer.chat_template is not None: - messages = [[{"role": "user", "content": prompt}] for prompt in prompts] - prompts = tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - - # Run gguf model. - with vllm_runner( - model_name=model.gguf_model, - enforce_eager=True, - tokenizer_name=model.original_model, - dtype=dtype, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=tp_size, - ) as gguf_model: - gguf_outputs = gguf_model.generate_greedy_logprobs( - prompts[:-1], max_tokens, num_logprobs - ) - - # Run unquantized model. - # Should run with tp=1, otherwise the test will stuck at - # nccl initialization. 
- with vllm_runner( - model_name=model.original_model, - enforce_eager=True, # faster tests - dtype=dtype, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=1, - ) as original_model: - original_outputs = original_model.generate_greedy_logprobs( - prompts[:-1], max_tokens, num_logprobs - ) - - check_logprobs_close( - outputs_0_lst=original_outputs, - outputs_1_lst=gguf_outputs, - name_0="original", - name_1="gguf", - ) - - -@pytest.mark.skipif( - not is_quant_method_supported("gguf"), - reason="gguf is not supported on this GPU type.", -) -@pytest.mark.parametrize( - "model", - [pytest.param(test_config, marks=test_config.marks) for test_config in MODELS], -) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("tp_size", [1]) -def test_models( - vllm_runner: type[VllmRunner], - example_prompts: list[str], - model: GGUFTestConfig, - dtype: str, - max_tokens: int, - num_logprobs: int, - tp_size: int, -) -> None: - check_model_outputs( - vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size - ) - - -@pytest.mark.skipif( - not is_quant_method_supported("gguf"), - reason="gguf is not supported on this GPU type.", -) -@pytest.mark.parametrize("model", [LLAMA_CONFIG]) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [8]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("tp_size", [2]) -@multi_gpu_test(num_gpus=2) -def test_distributed( - vllm_runner: type[VllmRunner], - example_prompts: list[str], - model: GGUFTestConfig, - dtype: str, - max_tokens: int, - num_logprobs: int, - tp_size: int, -) -> None: - check_model_outputs( - vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size - ) diff --git a/tests/models/test_gguf_download.py b/tests/models/test_gguf_download.py deleted file mode 100644 index e9ca35afd66a..000000000000 --- 
a/tests/models/test_gguf_download.py +++ /dev/null @@ -1,221 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from unittest.mock import MagicMock, patch - -import pytest - -from vllm.config import ModelConfig -from vllm.config.load import LoadConfig -from vllm.model_executor.model_loader.gguf_loader import GGUFModelLoader -from vllm.model_executor.model_loader.weight_utils import download_gguf - - -class TestGGUFDownload: - """Test GGUF model downloading functionality.""" - - @patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf") - def test_download_gguf_single_file(self, mock_download): - """Test downloading a single GGUF file.""" - # Setup mock - mock_folder = "/tmp/mock_cache" - mock_download.return_value = mock_folder - - # Mock glob to return a single file - with patch("glob.glob") as mock_glob: - mock_glob.side_effect = lambda pattern, **kwargs: ( - [f"{mock_folder}/model-IQ1_S.gguf"] if "IQ1_S" in pattern else [] - ) - - result = download_gguf("unsloth/Qwen3-0.6B-GGUF", "IQ1_S") - - # Verify download_weights_from_hf was called with correct patterns - mock_download.assert_called_once_with( - model_name_or_path="unsloth/Qwen3-0.6B-GGUF", - cache_dir=None, - allow_patterns=[ - "*-IQ1_S.gguf", - "*-IQ1_S-*.gguf", - "*/*-IQ1_S.gguf", - "*/*-IQ1_S-*.gguf", - ], - revision=None, - ignore_patterns=None, - ) - - # Verify result is the file path, not folder - assert result == f"{mock_folder}/model-IQ1_S.gguf" - - @patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf") - def test_download_gguf_sharded_files(self, mock_download): - """Test downloading sharded GGUF files.""" - mock_folder = "/tmp/mock_cache" - mock_download.return_value = mock_folder - - # Mock glob to return sharded files - with patch("glob.glob") as mock_glob: - mock_glob.side_effect = lambda pattern, **kwargs: ( - [ - f"{mock_folder}/model-Q2_K-00001-of-00002.gguf", - 
f"{mock_folder}/model-Q2_K-00002-of-00002.gguf", - ] - if "Q2_K" in pattern - else [] - ) - - result = download_gguf("unsloth/gpt-oss-120b-GGUF", "Q2_K") - - # Should return the first file after sorting - assert result == f"{mock_folder}/model-Q2_K-00001-of-00002.gguf" - - @patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf") - def test_download_gguf_subdir(self, mock_download): - """Test downloading GGUF files from subdirectory.""" - mock_folder = "/tmp/mock_cache" - mock_download.return_value = mock_folder - - with patch("glob.glob") as mock_glob: - mock_glob.side_effect = lambda pattern, **kwargs: ( - [f"{mock_folder}/Q2_K/model-Q2_K.gguf"] - if "Q2_K" in pattern or "**/*.gguf" in pattern - else [] - ) - - result = download_gguf("unsloth/gpt-oss-120b-GGUF", "Q2_K") - - assert result == f"{mock_folder}/Q2_K/model-Q2_K.gguf" - - @patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf") - @patch("glob.glob", return_value=[]) - def test_download_gguf_no_files_found(self, mock_glob, mock_download): - """Test error when no GGUF files are found.""" - mock_folder = "/tmp/mock_cache" - mock_download.return_value = mock_folder - - with pytest.raises(ValueError, match="Downloaded GGUF files not found"): - download_gguf("unsloth/Qwen3-0.6B-GGUF", "IQ1_S") - - -class TestGGUFModelLoader: - """Test GGUFModelLoader class methods.""" - - @patch("os.path.isfile", return_value=True) - def test_prepare_weights_local_file(self, mock_isfile): - """Test _prepare_weights with local file.""" - load_config = LoadConfig(load_format="gguf") - loader = GGUFModelLoader(load_config) - - # Create a simple mock ModelConfig with only the model attribute - model_config = MagicMock() - model_config.model = "/path/to/model.gguf" - - result = loader._prepare_weights(model_config) - assert result == "/path/to/model.gguf" - mock_isfile.assert_called_once_with("/path/to/model.gguf") - - 
@patch("vllm.model_executor.model_loader.gguf_loader.hf_hub_download") - @patch("os.path.isfile", return_value=False) - def test_prepare_weights_repo_filename(self, mock_isfile, mock_hf_download): - """Test _prepare_weights with repo_id/filename.gguf format.""" - load_config = LoadConfig(load_format="gguf") - loader = GGUFModelLoader(load_config) - - mock_hf_download.return_value = "/downloaded/model.gguf" - - # Create a simple mock ModelConfig with only the model attribute - model_config = MagicMock() - model_config.model = "unsloth/Qwen3-0.6B-GGUF/model.gguf" - - result = loader._prepare_weights(model_config) - assert result == "/downloaded/model.gguf" - mock_hf_download.assert_called_once_with( - repo_id="unsloth/Qwen3-0.6B-GGUF", filename="model.gguf" - ) - - @patch("vllm.config.model.get_hf_image_processor_config", return_value=None) - @patch("vllm.transformers_utils.config.file_or_path_exists", return_value=True) - @patch("vllm.config.model.get_config") - @patch("vllm.config.model.is_gguf", return_value=True) - @patch("vllm.model_executor.model_loader.gguf_loader.download_gguf") - @patch("os.path.isfile", return_value=False) - def test_prepare_weights_repo_quant_type( - self, - mock_isfile, - mock_download_gguf, - mock_is_gguf, - mock_get_config, - mock_file_exists, - mock_get_image_config, - ): - """Test _prepare_weights with repo_id:quant_type format.""" - mock_hf_config = MagicMock() - mock_hf_config.architectures = ["Qwen3ForCausalLM"] - - class MockTextConfig: - max_position_embeddings = 4096 - sliding_window = None - model_type = "qwen3" - num_attention_heads = 32 - - mock_text_config = MockTextConfig() - mock_hf_config.get_text_config.return_value = mock_text_config - mock_hf_config.dtype = "bfloat16" - mock_get_config.return_value = mock_hf_config - - load_config = LoadConfig(load_format="gguf") - loader = GGUFModelLoader(load_config) - - mock_download_gguf.return_value = "/downloaded/model-IQ1_S.gguf" - - model_config = ModelConfig( - 
model="unsloth/Qwen3-0.6B-GGUF:IQ1_S", tokenizer="Qwen/Qwen3-0.6B" - ) - result = loader._prepare_weights(model_config) - # The actual result will be the downloaded file path from mock - assert result == "/downloaded/model-IQ1_S.gguf" - mock_download_gguf.assert_called_once_with( - "unsloth/Qwen3-0.6B-GGUF", - "IQ1_S", - cache_dir=None, - revision=None, - ignore_patterns=["original/**/*"], - ) - - @patch("vllm.config.model.get_hf_image_processor_config", return_value=None) - @patch("vllm.config.model.get_config") - @patch("vllm.config.model.is_gguf", return_value=False) - @patch("vllm.transformers_utils.gguf_utils.check_gguf_file", return_value=False) - @patch("os.path.isfile", return_value=False) - def test_prepare_weights_invalid_format( - self, - mock_isfile, - mock_check_gguf, - mock_is_gguf, - mock_get_config, - mock_get_image_config, - ): - """Test _prepare_weights with invalid format.""" - mock_hf_config = MagicMock() - mock_hf_config.architectures = ["Qwen3ForCausalLM"] - - class MockTextConfig: - max_position_embeddings = 4096 - sliding_window = None - model_type = "qwen3" - num_attention_heads = 32 - - mock_text_config = MockTextConfig() - mock_hf_config.get_text_config.return_value = mock_text_config - mock_hf_config.dtype = "bfloat16" - mock_get_config.return_value = mock_hf_config - - load_config = LoadConfig(load_format="gguf") - loader = GGUFModelLoader(load_config) - - # Create ModelConfig with a valid repo_id to avoid validation errors - # Then test _prepare_weights with invalid format - model_config = ModelConfig(model="unsloth/Qwen3-0.6B") - # Manually set model to invalid format after creation - model_config.model = "invalid-format" - with pytest.raises(ValueError, match="Unrecognised GGUF reference"): - loader._prepare_weights(model_config) diff --git a/tests/transformers_utils/test_utils.py b/tests/transformers_utils/test_utils.py index 94dd014c929f..adcb02a9300a 100644 --- a/tests/transformers_utils/test_utils.py +++ 
b/tests/transformers_utils/test_utils.py @@ -1,15 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from pathlib import Path -from unittest.mock import patch - -import pytest - -from vllm.transformers_utils.gguf_utils import ( - is_gguf, - is_remote_gguf, - split_remote_gguf, -) from vllm.transformers_utils.utils import ( is_azure, is_cloud_storage, @@ -45,203 +35,3 @@ def test_is_cloud_storage(): assert is_cloud_storage("az://model-container/path") assert not is_cloud_storage("/unix/local/path") assert not is_cloud_storage("nfs://nfs-fqdn.local") - - -class TestIsRemoteGGUF: - """Test is_remote_gguf utility function.""" - - def test_is_remote_gguf_with_colon_and_slash(self): - """Test is_remote_gguf with repo_id:quant_type format.""" - # Valid quant types (exact GGML types) - assert is_remote_gguf("unsloth/Qwen3-0.6B-GGUF:IQ1_S") - assert is_remote_gguf("user/repo:Q2_K") - assert is_remote_gguf("repo/model:Q4_K") - assert is_remote_gguf("repo/model:Q8_0") - - # Invalid quant types should return False - assert not is_remote_gguf("repo/model:quant") - assert not is_remote_gguf("repo/model:INVALID") - assert not is_remote_gguf("repo/model:invalid_type") - - def test_is_remote_gguf_extended_quant_types(self): - """Test is_remote_gguf with extended quant type naming conventions.""" - # Extended quant types with _M, _S, _L suffixes - assert is_remote_gguf("repo/model:Q4_K_M") - assert is_remote_gguf("repo/model:Q4_K_S") - assert is_remote_gguf("repo/model:Q3_K_L") - assert is_remote_gguf("repo/model:Q5_K_M") - assert is_remote_gguf("repo/model:Q3_K_S") - - # Extended quant types with _XL, _XS, _XXS suffixes - assert is_remote_gguf("repo/model:Q5_K_XL") - assert is_remote_gguf("repo/model:IQ4_XS") - assert is_remote_gguf("repo/model:IQ3_XXS") - - # Invalid extended types (base type doesn't exist) - assert not is_remote_gguf("repo/model:INVALID_M") - assert not is_remote_gguf("repo/model:Q9_K_M") - - def 
test_is_remote_gguf_nonstandard_quant_type(self): - """Test is_remote_gguf with non-standard quant types containing - a known GGML type.""" - # Non-standard quant types with known GGML type after prefix - assert is_remote_gguf("unsloth/Qwen3.5-35B-A3B-GGUF:UD-Q4_K_XL") - assert is_remote_gguf("user/Model:UD-Q4_K_M") - assert is_remote_gguf("user/SomeModel:Custom-Q8_0") - - # Exact GGML type after prefix (no suffix stripping needed) - assert is_remote_gguf("user/Model-GGUF:UD-IQ4_NL") - assert is_remote_gguf("user/Model-GGUF:UD-Q8_0") - - # Completely unknown quant types should still fail - assert not is_remote_gguf("repo/model:TOTALLY-RANDOM") - assert not is_remote_gguf("user/Model:UD-INVALID") - - # No dash separator → not recognized as prefixed - assert not is_remote_gguf("repo/model:UDIQ4NL") - - def test_is_remote_gguf_without_colon(self): - """Test is_remote_gguf without colon.""" - assert not is_remote_gguf("repo/model") - assert not is_remote_gguf("unsloth/Qwen3-0.6B-GGUF") - - def test_is_remote_gguf_without_slash(self): - """Test is_remote_gguf without slash.""" - assert not is_remote_gguf("model.gguf") - # Even with valid quant_type, no slash means not remote GGUF - assert not is_remote_gguf("model:IQ1_S") - assert not is_remote_gguf("model:quant") - - def test_is_remote_gguf_local_path(self): - """Test is_remote_gguf with local file path.""" - assert not is_remote_gguf("/path/to/model.gguf") - assert not is_remote_gguf("./model.gguf") - - def test_is_remote_gguf_with_path_object(self): - """Test is_remote_gguf with Path object.""" - assert is_remote_gguf(Path("unsloth/Qwen3-0.6B-GGUF:IQ1_S")) - assert not is_remote_gguf(Path("repo/model")) - - def test_is_remote_gguf_with_http_https(self): - """Test is_remote_gguf with HTTP/HTTPS URLs.""" - # HTTP/HTTPS URLs should return False even with valid quant_type - assert not is_remote_gguf("http://example.com/repo/model:IQ1_S") - assert not is_remote_gguf("https://huggingface.co/repo/model:Q2_K") - assert not 
is_remote_gguf("http://repo/model:Q4_K") - assert not is_remote_gguf("https://repo/model:Q8_0") - - def test_is_remote_gguf_with_cloud_storage(self): - """Test is_remote_gguf with cloud storage paths.""" - # Cloud storage paths should return False even with valid quant_type - assert not is_remote_gguf("s3://bucket/repo/model:IQ1_S") - assert not is_remote_gguf("gs://bucket/repo/model:Q2_K") - assert not is_remote_gguf("s3://repo/model:Q4_K") - assert not is_remote_gguf("gs://repo/model:Q8_0") - - -class TestSplitRemoteGGUF: - """Test split_remote_gguf utility function.""" - - def test_split_remote_gguf_valid(self): - """Test split_remote_gguf with valid repo_id:quant_type format.""" - repo_id, quant_type = split_remote_gguf("unsloth/Qwen3-0.6B-GGUF:IQ1_S") - assert repo_id == "unsloth/Qwen3-0.6B-GGUF" - assert quant_type == "IQ1_S" - - repo_id, quant_type = split_remote_gguf("repo/model:Q2_K") - assert repo_id == "repo/model" - assert quant_type == "Q2_K" - - def test_split_remote_gguf_extended_quant_types(self): - """Test split_remote_gguf with extended quant type naming conventions.""" - repo_id, quant_type = split_remote_gguf("unsloth/Qwen3-0.6B-GGUF:Q4_K_M") - assert repo_id == "unsloth/Qwen3-0.6B-GGUF" - assert quant_type == "Q4_K_M" - - repo_id, quant_type = split_remote_gguf("repo/model:Q3_K_S") - assert repo_id == "repo/model" - assert quant_type == "Q3_K_S" - - def test_split_remote_gguf_nonstandard_quant_type(self): - """Test split_remote_gguf with non-standard quant types in GGUF repos.""" - repo_id, quant_type = split_remote_gguf( - "unsloth/Qwen3.5-35B-A3B-GGUF:UD-Q4_K_XL" - ) - assert repo_id == "unsloth/Qwen3.5-35B-A3B-GGUF" - assert quant_type == "UD-Q4_K_XL" - - def test_split_remote_gguf_with_path_object(self): - """Test split_remote_gguf with Path object.""" - repo_id, quant_type = split_remote_gguf(Path("unsloth/Qwen3-0.6B-GGUF:IQ1_S")) - assert repo_id == "unsloth/Qwen3-0.6B-GGUF" - assert quant_type == "IQ1_S" - - def 
test_split_remote_gguf_invalid(self): - """Test split_remote_gguf with invalid format.""" - # Invalid format (no colon) - is_remote_gguf returns False - with pytest.raises(ValueError, match="Wrong GGUF model"): - split_remote_gguf("repo/model") - - # Invalid quant type - is_remote_gguf returns False - with pytest.raises(ValueError, match="Wrong GGUF model"): - split_remote_gguf("repo/model:INVALID_TYPE") - - # HTTP URL - is_remote_gguf returns False - with pytest.raises(ValueError, match="Wrong GGUF model"): - split_remote_gguf("http://repo/model:IQ1_S") - - # Cloud storage - is_remote_gguf returns False - with pytest.raises(ValueError, match="Wrong GGUF model"): - split_remote_gguf("s3://bucket/repo/model:Q2_K") - - -class TestIsGGUF: - """Test is_gguf utility function.""" - - @patch("vllm.transformers_utils.gguf_utils.check_gguf_file", return_value=True) - def test_is_gguf_with_local_file(self, mock_check_gguf): - """Test is_gguf with local GGUF file.""" - assert is_gguf("/path/to/model.gguf") - assert is_gguf("./model.gguf") - - def test_is_gguf_with_remote_gguf(self): - """Test is_gguf with remote GGUF format.""" - # Valid remote GGUF format (repo_id:quant_type with valid quant_type) - assert is_gguf("unsloth/Qwen3-0.6B-GGUF:IQ1_S") - assert is_gguf("repo/model:Q2_K") - assert is_gguf("repo/model:Q4_K") - - # Extended quant types with suffixes - assert is_gguf("repo/model:Q4_K_M") - assert is_gguf("repo/model:Q3_K_S") - assert is_gguf("repo/model:Q5_K_L") - - # Invalid quant_type should return False - assert not is_gguf("repo/model:quant") - assert not is_gguf("repo/model:INVALID") - - @patch("vllm.transformers_utils.gguf_utils.check_gguf_file", return_value=False) - def test_is_gguf_false(self, mock_check_gguf): - """Test is_gguf returns False for non-GGUF models.""" - assert not is_gguf("unsloth/Qwen3-0.6B") - assert not is_gguf("repo/model") - assert not is_gguf("model") - - def test_is_gguf_edge_cases(self): - """Test is_gguf with edge cases.""" - # Empty 
string - assert not is_gguf("") - - # Only colon, no slash (even with valid quant_type) - assert not is_gguf("model:IQ1_S") - - # Only slash, no colon - assert not is_gguf("repo/model") - - # HTTP/HTTPS URLs - assert not is_gguf("http://repo/model:IQ1_S") - assert not is_gguf("https://repo/model:Q2_K") - - # Cloud storage - assert not is_gguf("s3://bucket/repo/model:IQ1_S") - assert not is_gguf("gs://bucket/repo/model:Q2_K") diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 01ac03f27a3f..eada640e7612 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -764,69 +764,6 @@ def _allspark_w8a16_gemm_fake( return torch.empty((m, n), device=a.device, dtype=a.dtype) -if hasattr(torch.ops._C, "ggml_dequantize"): - - @register_fake("_C::ggml_dequantize") - def _ggml_dequantize_fake( - W: torch.Tensor, - quant_type: int, - m: torch.SymInt, - n: torch.SymInt, - dtype: torch.dtype | None = None, - ) -> torch.Tensor: - return torch.empty((m, n), dtype=torch.float16, device=W.device) - - @register_fake("_C::ggml_mul_mat_vec_a8") - def _ggml_mul_mat_vec_a8_fake( - W: torch.Tensor, - X: torch.Tensor, - quant_type: int, - row: torch.SymInt, - ) -> torch.Tensor: - return torch.empty((X.shape[0], row), dtype=X.dtype, device=W.device) - - @register_fake("_C::ggml_mul_mat_a8") - def _ggml_mul_mat_a8_fake( - W: torch.Tensor, - X: torch.Tensor, - quant_type: int, - row: torch.SymInt, - ) -> torch.Tensor: - batch = X.size(0) - return torch.empty((batch, row), dtype=X.dtype, device=W.device) - - @register_fake("_C::ggml_moe_a8") - def _ggml_moe_a8_fake( - X: torch.Tensor, - W: torch.Tensor, - sorted_token_ids: torch.Tensor, - expert_ids: torch.Tensor, - num_tokens_post_padded: torch.Tensor, - quant_type: int, - row: torch.SymInt, - top_k: torch.SymInt, - tokens: torch.SymInt, - ) -> torch.Tensor: - tokens = X.size(0) - return torch.empty((tokens * top_k, row), dtype=torch.float16, device=W.device) - - -if hasattr(torch.ops._C, "ggml_moe_a8_vec"): - - 
@register_fake("_C::ggml_moe_a8_vec") - def _ggml_moe_a8_vec_fake( - X: torch.Tensor, - W: torch.Tensor, - topk_ids: torch.Tensor, - top_k: int, - quant_type: int, - row: torch.SymInt, - tokens: torch.SymInt, - ) -> torch.Tensor: - tokens = X.size(0) - return torch.empty((tokens * top_k, row), dtype=X.dtype, device=W.device) - - # cutlass def cutlass_scaled_mm_supports_fp4(cuda_device_capability: int) -> bool: return torch.ops._C.cutlass_scaled_mm_supports_fp4(cuda_device_capability) @@ -2041,71 +1978,6 @@ def scaled_int8_quant( return output, input_scales, input_azp -# gguf -def ggml_dequantize( - W: torch.Tensor, quant_type: int, m: int, n: int, dtype: torch.dtype | None -) -> torch.Tensor: - return torch.ops._C.ggml_dequantize(W, quant_type, m, n, dtype) - - -def ggml_mul_mat_vec_a8( - W: torch.Tensor, - X: torch.Tensor, - quant_type: int, - row: int, -) -> torch.Tensor: - return torch.ops._C.ggml_mul_mat_vec_a8(W, X, quant_type, row) - - -def ggml_mul_mat_a8( - W: torch.Tensor, - X: torch.Tensor, - quant_type: int, - row: int, -) -> torch.Tensor: - return torch.ops._C.ggml_mul_mat_a8(W, X, quant_type, row) - - -def ggml_moe_a8( - X: torch.Tensor, - W: torch.Tensor, - sorted_token_ids: torch.Tensor, - expert_ids: torch.Tensor, - num_tokens_post_padded: torch.Tensor, - quant_type: int, - row: int, - top_k: int, - tokens: int, -) -> torch.Tensor: - return torch.ops._C.ggml_moe_a8( - X, - W, - sorted_token_ids, - expert_ids, - num_tokens_post_padded, - quant_type, - row, - top_k, - tokens, - ) - - -def ggml_moe_a8_vec( - X: torch.Tensor, - W: torch.Tensor, - topk_ids: torch.Tensor, - top_k: int, - quant_type: int, - row: torch.SymInt, - tokens: torch.SymInt, -) -> torch.Tensor: - return torch.ops._C.ggml_moe_a8_vec(X, W, topk_ids, top_k, quant_type, row, tokens) - - -def ggml_moe_get_block_size(quant_type: int) -> int: - return torch.ops._C.ggml_moe_get_block_size(quant_type) - - # mamba def selective_scan_fwd( u: torch.Tensor, diff --git a/vllm/config/load.py 
b/vllm/config/load.py index 93240ec5fc0f..e27c1ce0fd0c 100644 --- a/vllm/config/load.py +++ b/vllm/config/load.py @@ -48,8 +48,6 @@ class LoadConfig: - "bitsandbytes" will load the weights using bitsandbytes quantization. - "sharded_state" will load weights from pre-sharded checkpoint files, supporting efficient loading of tensor-parallel models. - - "gguf" will load weights from GGUF format files (details specified in - https://github.com/ggml-org/ggml/blob/master/docs/gguf.md). - "mistral" will load weights from consolidated safetensors files used by Mistral models. - Other custom values can be supported via plugins. diff --git a/vllm/config/model.py b/vllm/config/model.py index 1cce7f9d94cc..054f14a26fef 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -25,6 +25,7 @@ from vllm.config.scheduler import RunnerType from vllm.config.utils import config, getattr_iter from vllm.logger import init_logger +from vllm.model_format import get_model_format_handler from vllm.platforms import current_platform from vllm.tasks import PoolingTask, ScoreType, SupportedTask from vllm.transformers_utils.config import ( @@ -42,12 +43,6 @@ uses_mrope, uses_xdrope_dim, ) -from vllm.transformers_utils.gguf_utils import ( - is_gguf, - is_remote_gguf, - maybe_patch_hf_config_from_gguf, - split_remote_gguf, -) from vllm.transformers_utils.model_arch_config_convertor import ( MODEL_ARCH_CONFIG_CONVERTORS, ModelArchConfigConvertorBase, @@ -503,10 +498,8 @@ def __post_init__( hf_overrides_fn=hf_overrides_fn, token=self.hf_token, ) - hf_config = maybe_patch_hf_config_from_gguf( - self.model, - hf_config, - ) + if handler := get_model_format_handler(self.model): + hf_config = handler.patch_model_hf_config(self.model, hf_config) self.hf_config = hf_config if dict_overrides: @@ -666,12 +659,8 @@ def __post_init__( ) # Multimodal GGUF models must use original repo for mm processing - if is_gguf(self.tokenizer) and self.is_multimodal_model: - raise ValueError( - "Loading a multimodal 
GGUF model needs to use original " - "tokenizer. Please specify the unquantized hf model's " - "repo name or path using the --tokenizer argument." - ) + if handler := get_model_format_handler(self.model): + handler.validate_model_config(self) if self.disable_sliding_window: # Set after get_and_verify_max_len to ensure that max_model_len @@ -826,8 +815,8 @@ def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> No def _get_encoder_config(self) -> dict[str, Any] | None: model = self.model - if is_remote_gguf(model): - model, _ = split_remote_gguf(model) + if handler := get_model_format_handler(model): + model = handler.resolve_sentence_transformer_source(model, self.revision) return get_sentence_transformer_tokenizer_config(model, self.revision) def _get_default_runner_type( @@ -952,7 +941,6 @@ def _verify_quantization(self) -> None: # imports during override detection (e.g., MXFP4 imports Triton) "mxfp4", "cpu_awq", - "gguf", ] quantization_methods = [ q for q in supported_quantization if q not in overrides diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c9b90848ff04..b42904367883 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -93,6 +93,7 @@ from vllm.config.utils import get_field from vllm.config.vllm import OptimizationLevel, PerformanceMode from vllm.logger import init_logger, suppress_logging +from vllm.model_format import get_model_format_handler from vllm.platforms import CpuArchEnum, current_platform from vllm.plugins import load_general_plugins from vllm.ray.lazy_utils import is_in_ray_actor, is_ray_initialized @@ -100,7 +101,6 @@ is_interleaved, maybe_override_with_speculators, ) -from vllm.transformers_utils.gguf_utils import is_gguf from vllm.transformers_utils.repo_utils import get_model_path from vllm.transformers_utils.utils import is_cloud_storage from vllm.utils.argparse_utils import FlexibleArgumentParser @@ -1416,9 +1416,10 @@ def from_cli_args(cls, args: argparse.Namespace): 
return engine_args def create_model_config(self) -> ModelConfig: - # gguf file needs a specific model loader - if is_gguf(self.model): - self.quantization = self.load_format = "gguf" + load_general_plugins() + + if handler := get_model_format_handler(self.model): + handler.update_engine_args(self) if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: logger.warning( @@ -1559,6 +1560,7 @@ def create_engine_config( NOTE: If VllmConfig is incompatible, we raise an error. """ current_platform.pre_register_and_update() + load_general_plugins() device_config = DeviceConfig(device=cast(Device, current_platform.device_type)) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index c4fc1fd2557e..178aa361602d 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1113,9 +1113,11 @@ def weight_loader( # dimension intermediate_size_per_partition is used. SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0} - is_gguf_weight = getattr(param, "is_gguf_weight", False) - is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) - if is_gguf_weight_type: + needs_custom_weight_materialization = getattr( + param, "needs_custom_weight_materialization", False + ) + needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False) + if needs_custom_weight_type: param.weight_type = loaded_weight.item() param.data.copy_(loaded_weight) return True if return_success else None @@ -1165,8 +1167,9 @@ def weight_loader( if full_load: shard_dim += 1 - # Materialize GGUF UninitializedParameter accounting merged weights - if is_gguf_weight and isinstance(param, UninitializedParameter): + if needs_custom_weight_materialization and isinstance( + param, UninitializedParameter + ): # To materialize a tensor, we must have full shape including # number of experts, making this portion to require `full_load`. 
assert full_load diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 975fedabd675..297a10bd3942 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -360,15 +360,16 @@ def __init__( def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # If the weight on disk does not have a shape, give it one # (such scales for AutoFp8). - # Special case for GGUF - - is_gguf_weight = getattr(param, "is_gguf_weight", False) - is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) - if is_gguf_weight_type: + needs_custom_weight_materialization = getattr( + param, "needs_custom_weight_materialization", False + ) + needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False) + if needs_custom_weight_type: param.weight_type = loaded_weight.item() - # Materialize GGUF UninitializedParameter - if is_gguf_weight and isinstance(param, UninitializedParameter): + if needs_custom_weight_materialization and isinstance( + param, UninitializedParameter + ): param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype) if len(loaded_weight.shape) == 0: @@ -534,14 +535,16 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # no need to narrow is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit - # Special case for GGUF - is_gguf_weight = getattr(param, "is_gguf_weight", False) - is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) - if is_gguf_weight_type: + needs_custom_weight_materialization = getattr( + param, "needs_custom_weight_materialization", False + ) + needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False) + if needs_custom_weight_type: param.weight_type = loaded_weight.item() - # Materialize GGUF UninitializedParameter - if is_gguf_weight and isinstance(param, UninitializedParameter): + if needs_custom_weight_materialization and isinstance( + param, UninitializedParameter + ): 
final_shape = list(loaded_weight.shape) if output_dim is not None: assert final_shape[output_dim] % self.tp_size == 0 @@ -692,17 +695,18 @@ def weight_loader( loaded_shard_id: tuple[int, ...] | int | None = None, ): self.validate_shard_id(loaded_shard_id) - # Special case for GGUF - # initialize GGUF param after we know the quantize type - is_gguf_weight = getattr(param, "is_gguf_weight", False) - is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) + needs_custom_weight_materialization = getattr( + param, "needs_custom_weight_materialization", False + ) + needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False) if isinstance(loaded_shard_id, tuple) and ( - is_gguf_weight or is_gguf_weight_type + needs_custom_weight_materialization or needs_custom_weight_type ): raise NotImplementedError( - "Shard id with multiple indices is not supported for GGUF." + "Shard id with multiple indices is not supported for this " + "format-specific weight loader." ) - if is_gguf_weight_type: + if needs_custom_weight_type: if loaded_shard_id is not None: param.data[loaded_shard_id].copy_(loaded_weight) param.shard_weight_type[loaded_shard_id] = loaded_weight.item() @@ -712,7 +716,7 @@ def weight_loader( } return - if is_gguf_weight: + if needs_custom_weight_materialization: output_dim = getattr(param, "output_dim", None) shard_size = loaded_weight.size(output_dim) // self.tp_size start_idx = self.tp_rank * shard_size @@ -1168,11 +1172,11 @@ def weight_loader( loaded_shard_id: str | None = None, ): self.validate_shard_id(loaded_shard_id) - # Special case for GGUF - # initialize GGUF param after we know the quantize type - is_gguf_weight = getattr(param, "is_gguf_weight", False) - is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) - if is_gguf_weight_type: + needs_custom_weight_materialization = getattr( + param, "needs_custom_weight_materialization", False + ) + needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False) 
+ if needs_custom_weight_type: idx_map = {"q": 0, "k": 1, "v": 2} if loaded_shard_id is not None: param.data[idx_map[loaded_shard_id]].copy_(loaded_weight) @@ -1181,7 +1185,7 @@ def weight_loader( param.shard_weight_type = {k: loaded_weight.item() for k in idx_map} return - if is_gguf_weight: + if needs_custom_weight_materialization: output_dim = getattr(param, "output_dim", None) shard_size = loaded_weight.size(output_dim) // self.tp_size start_idx = self.tp_rank * shard_size @@ -1480,14 +1484,16 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # no need to narrow is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit - # Special case for GGUF - is_gguf_weight = getattr(param, "is_gguf_weight", False) - is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) - if is_gguf_weight_type: + needs_custom_weight_materialization = getattr( + param, "needs_custom_weight_materialization", False + ) + needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False) + if needs_custom_weight_type: param.weight_type = loaded_weight.item() - # Materialize GGUF UninitializedParameter - if is_gguf_weight and isinstance(param, UninitializedParameter): + if needs_custom_weight_materialization and isinstance( + param, UninitializedParameter + ): weight_shape = list(loaded_weight.shape) if input_dim: weight_shape[input_dim] = weight_shape[input_dim] // self.tp_size diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 1ac0f9ee9cc5..b2fb6a528a5d 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -18,7 +18,6 @@ "modelopt_fp4", "modelopt_mxfp8", "modelopt_mixed", - "gguf", "gptq_marlin", "awq_marlin", "gptq", @@ -122,7 +121,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: from .fbgemm_fp8 import FBGEMMFp8Config from .fp8 import Fp8Config from .fp_quant 
import FPQuantConfig - from .gguf import GGUFConfig from .gptq import GPTQConfig from .gptq_marlin import GPTQMarlinConfig from .inc import INCConfig @@ -147,7 +145,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: "modelopt_fp4": ModelOptNvFp4Config, "modelopt_mxfp8": ModelOptMxFp8Config, "modelopt_mixed": ModelOptMixedPrecisionConfig, - "gguf": GGUFConfig, "gptq_marlin": GPTQMarlinConfig, "awq_marlin": AWQMarlinConfig, "gptq": GPTQConfig, diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index eedc62f7d4d5..4ed906598b7d 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -120,6 +120,10 @@ def override_quantization_method( """ return None + @classmethod + def requires_hf_quant_config(cls) -> bool: + return True + @staticmethod def get_from_keys(config: dict[str, Any], keys: list[str]) -> Any: """Get a value from the model's quantization config.""" @@ -156,6 +160,28 @@ def get_quant_method( def get_cache_scale(self, name: str) -> str | None: return None + def override_is_neox_style(self, model_type: str) -> bool | None: + return None + + def should_keep_tied_lm_head(self) -> bool: + return False + + def transform_loaded_weight( + self, + name: str, + loaded_weight: torch.Tensor, + ) -> torch.Tensor: + return loaded_weight + + def remap_loaded_parameter( + self, + name: str, + param: torch.Tensor, + loaded_weight: torch.Tensor, + params_dict: dict[str, torch.Tensor], + ) -> torch.Tensor: + return param + def apply_vllm_mapper( # noqa: B027 self, hf_to_vllm_mapper: "WeightsMapper" ): diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py deleted file mode 100644 index 2a72da26cc62..000000000000 --- a/vllm/model_executor/layers/quantization/gguf.py +++ /dev/null @@ -1,691 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from collections.abc import Mapping -from types import MappingProxyType -from typing import TYPE_CHECKING, Any - -if TYPE_CHECKING: - from vllm.model_executor.layers.quantization import QuantizationMethods - -import gguf -import torch -from gguf import GGMLQuantizationType as WeightType -from torch.nn.parameter import Parameter, UninitializedParameter - -from vllm import _custom_ops as ops -from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.activation import ( - MoEActivation, - apply_moe_activation, -) -from vllm.model_executor.layers.fused_moe.config import ( - FusedMoEConfig, - FusedMoEQuantConfig, -) -from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, - FusedMoEMethodBase, -) -from vllm.model_executor.layers.linear import ( - LinearBase, - LinearMethodBase, - UnquantizedLinearMethod, -) -from vllm.model_executor.layers.quantization import QuantizationMethods -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig, - QuantizeMethodBase, -) -from vllm.model_executor.layers.vocab_parallel_embedding import ( - UnquantizedEmbeddingMethod, - VocabParallelEmbedding, -) -from vllm.model_executor.models.utils import WeightsMapper -from vllm.model_executor.utils import set_weight_attrs -from vllm.platforms import current_platform -from vllm.utils.torch_utils import direct_register_custom_op - -logger = init_logger(__name__) - - -class GGUFConfig(QuantizationConfig): - """Config class for GGUF.""" - - def __init__(self, unquantized_modules: list[str] | None = None) -> None: - super().__init__() - self.unquantized_modules = unquantized_modules or [] - - def __repr__(self) -> str: - return "GGUFConfig()" - - def get_name(self) -> QuantizationMethods: - return "gguf" - - def get_supported_act_dtypes(self) -> list[torch.dtype]: - # GGUF dequantization kernels use half precision (fp16) internally. 
- # bfloat16 has precision issues on Blackwell devices. - if current_platform.has_device_capability(100): - logger.warning_once("GGUF has precision issues with bfloat16 on Blackwell.") - return [torch.half, torch.float32] - return [torch.half, torch.bfloat16, torch.float32] - - @classmethod - def get_min_capability(cls) -> int: - return 60 - - @classmethod - def get_config_filenames(cls) -> list[str]: - return [] # no extra configs. - - @classmethod - def from_config(cls, config: dict[str, Any]) -> "GGUFConfig": - return cls() - - @classmethod - def override_quantization_method( - cls, hf_quant_cfg: dict[str, Any], user_quant: str | None - ) -> "QuantizationMethods | None": - # When user explicitly specifies --quantization gguf, override - # whatever quantization method is in the HF model config (e.g. fp8). - if user_quant == "gguf": - return "gguf" - return None - - def get_quant_method( - self, layer: torch.nn.Module, prefix: str - ) -> "QuantizeMethodBase | None": - if isinstance(layer, LinearBase): - if is_layer_skipped_gguf( - prefix, self.unquantized_modules, self.packed_modules_mapping - ): - return UnquantizedLinearMethod() - return GGUFLinearMethod(self) - elif isinstance(layer, VocabParallelEmbedding): - if is_layer_skipped_gguf( - prefix, self.unquantized_modules, self.packed_modules_mapping - ): - return UnquantizedEmbeddingMethod() - return GGUFEmbeddingMethod(self) - elif isinstance(layer, FusedMoE): - # TODO: Select UnquantizedFusedMoEMethod on unquantized layers. 
- return GGUFMoEMethod(self, layer.moe_config) - return None - - def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): - """ - Interface for models to update module names referenced in - quantization configs in order to reflect the vllm model structure - - :param hf_to_vllm_mapper: maps from hf model structure (the assumed - structure of the qconfig) to vllm model structure - """ - if self.unquantized_modules is not None: - self.unquantized_modules = hf_to_vllm_mapper.apply_list( - self.unquantized_modules - ) - - -def is_layer_skipped_gguf( - prefix: str, - unquantized_modules: list[str], - fused_mapping: Mapping[str, list[str]] = MappingProxyType({}), -): - # Fused layers like gate_up_proj or qkv_proj will not be fused - # in the safetensors checkpoint. So, we convert the name - # from the fused version to unfused + check to make sure that - # each shard of the fused layer has the same scheme. - proj_name = prefix.split(".")[-1] - if proj_name in fused_mapping: - shard_prefixes = [ - prefix.replace(proj_name, shard_proj_name) - for shard_proj_name in fused_mapping[proj_name] - ] - - is_skipped = None - for shard_prefix in shard_prefixes: - is_shard_skipped = any( - shard_prefix in module_name for module_name in unquantized_modules - ) - - if is_skipped is None: - is_skipped = is_shard_skipped - elif is_shard_skipped != is_skipped: - raise ValueError( - f"Detected some but not all shards of {prefix} " - "are quantized. All shards of fused layers " - "to have the same precision." 
- ) - else: - is_skipped = any(module_name in prefix for module_name in unquantized_modules) - - assert is_skipped is not None - return is_skipped - - -UNQUANTIZED_TYPES = {WeightType.F32, WeightType.F16, WeightType.BF16} -STANDARD_QUANT_TYPES = { - WeightType.Q4_0, - WeightType.Q4_1, - WeightType.Q5_0, - WeightType.Q5_1, - WeightType.Q8_0, - WeightType.Q8_1, -} -KQUANT_TYPES = { - WeightType.Q2_K, - WeightType.Q3_K, - WeightType.Q4_K, - WeightType.Q5_K, - WeightType.Q6_K, -} -IMATRIX_QUANT_TYPES = { - WeightType.IQ1_M, - WeightType.IQ1_S, - WeightType.IQ2_XXS, - WeightType.IQ2_XS, - WeightType.IQ2_S, - WeightType.IQ3_XXS, - WeightType.IQ3_S, - WeightType.IQ4_XS, - WeightType.IQ4_NL, -} -# TODO(Isotr0py): Currently, we don't have MMQ kernel for I-Matrix quantization. -# Consolidate DEQUANT_TYPES, MMVQ_QUANT_TYPES and MMQ_QUANT_TYPES after we add -# MMQ kernel for I-Matrix quantization. -DEQUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES | IMATRIX_QUANT_TYPES -MMVQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES | IMATRIX_QUANT_TYPES -MMQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES - - -def _fused_mul_mat_gguf( - x: torch.Tensor, qweight: torch.Tensor, qweight_type: int -) -> torch.Tensor: - if qweight_type in IMATRIX_QUANT_TYPES: - mmvq_safe = 8 if qweight.shape[0] > 5120 else 16 - else: - mmvq_safe = 2 if qweight.shape[0] > 5120 else 6 - # HACK: when doing chunked prefill we don't generate output tokens - # so input to logits generator is empty which causes invalid parameter - if x.shape[0] == 0: - return torch.empty(x.shape[0], qweight.shape[0], dtype=x.dtype, device=x.device) - # there is no need to call any kernel for fp16/bf16 - if qweight_type in UNQUANTIZED_TYPES: - return x @ qweight.T - # enable MMVQ in contiguous batching with batch_size=1 - if x.shape[0] <= mmvq_safe and qweight_type in MMVQ_QUANT_TYPES: - y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0]) - # Use MMQ Kernel if it's available (standard + k-quants) - elif 
qweight_type in MMQ_QUANT_TYPES: - y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, qweight.shape[0]) - # If there is no available MMQ kernel, fallback to dequantize - elif qweight_type in DEQUANT_TYPES: - block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type] - shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size) - weight = ops.ggml_dequantize(qweight, qweight_type, *shape, x.dtype) - y = x @ weight.T - else: - # Raise an error if the quantization type is not supported. - # Might be useful if llama.cpp adds a new quantization type. - # Wrap to GGMLQuantizationType IntEnum to make sure it's a valid type. - qweight_type = WeightType(qweight_type) - raise NotImplementedError(f"Unsupported GGUF quantization type: {qweight_type}") - return y - - -def _fused_mul_mat_gguf_fake( - x: torch.Tensor, - qweight: torch.Tensor, - qweight_type: int, -) -> torch.Tensor: - return torch.empty(x.shape[0], qweight.shape[0], dtype=x.dtype, device=x.device) - - -try: - direct_register_custom_op( - op_name="_fused_mul_mat_gguf", - op_func=_fused_mul_mat_gguf, - fake_impl=_fused_mul_mat_gguf_fake, - ) - fused_mul_mat_gguf = torch.ops.vllm._fused_mul_mat_gguf - -except AttributeError as error: - raise error - - -def _fused_moe_gguf( - x: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - qweight_type: int, - qweight_type2: int, - activation: str, -) -> torch.Tensor: - activation_enum = MoEActivation.from_str(activation) - - def act(x: torch.Tensor): - d = x.shape[-1] // 2 - output_shape = x.shape[:-1] + (d,) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - apply_moe_activation(activation_enum, out, x) - return out - - # lazy import to avoid triggering triton import in CPU backend - from vllm.model_executor.layers.fused_moe.fused_moe import moe_align_block_size - - out_hidden_states = torch.empty_like(x) - # unless we decent expert reuse we are better off running moe_vec kernel - if ( - 
qweight_type2 in MMQ_QUANT_TYPES - and qweight_type in MMQ_QUANT_TYPES - and x.shape[0] > 64 - ): - num_tokens, _ = x.shape - E, N, _ = w1.shape - top_k = topk_ids.shape[1] - BLOCK_SIZE = ops.ggml_moe_get_block_size(qweight_type) - - sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size( - topk_ids, BLOCK_SIZE, E - ) - out = ops.ggml_moe_a8( - x, - w1, - sorted_token_ids, - expert_ids, - num_tokens_post_padded, - qweight_type, - N, - top_k, - num_tokens, - ) - out = act(out) - out = ops.ggml_moe_a8( - out, - w2, - sorted_token_ids, - expert_ids, - num_tokens_post_padded, - qweight_type2, - w2.shape[1], - 1, - num_tokens * top_k, - ) - out = out.reshape(num_tokens, top_k, w2.shape[1]).mul_( - topk_weights.view(num_tokens, top_k, 1) - ) - ops.moe_sum(out, out_hidden_states) - elif qweight_type2 in MMVQ_QUANT_TYPES and qweight_type in MMVQ_QUANT_TYPES: - num_tokens, _ = x.shape - E, N, _ = w1.shape - top_k = topk_ids.shape[1] - - out = ops.ggml_moe_a8_vec(x, w1, topk_ids, top_k, qweight_type, N, num_tokens) - out = act(out) - - out = ops.ggml_moe_a8_vec( - out, w2, topk_ids, 1, qweight_type2, w2.shape[1], num_tokens * top_k - ) - out = out.reshape(num_tokens, top_k, w2.shape[1]).mul_( - topk_weights.view(num_tokens, top_k, 1) - ) - ops.moe_sum(out, out_hidden_states) - else: - logger.warning_once( - "There is no support for fast MoE kernel " - "for current quantization method. " - "Falling back to slow implementation. 
" - ) - for tok, (w, idx) in enumerate(zip(topk_weights, topk_ids)): - inp = x[tok].reshape((1,) + x.shape[1:]) - current_hidden_state = None - for ww, ii in zip(w, idx): - expert_up = w1[ii] - - out = fused_mul_mat_gguf(inp, expert_up, qweight_type) - out = act(out) - - expert_down = w2[ii] - current_state = fused_mul_mat_gguf( - out, expert_down, qweight_type2 - ).mul_(ww) - if current_hidden_state is None: - current_hidden_state = current_state - else: - current_hidden_state.add_(current_state) - out_hidden_states[tok] = current_hidden_state - return out_hidden_states - - -def _fused_moe_gguf_fake( - x: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - qweight_type: int, - qweight_type2: int, - activation: str, -) -> torch.Tensor: - return torch.empty_like(x) - - -try: - direct_register_custom_op( - op_name="_fused_moe_gguf", - op_func=_fused_moe_gguf, - fake_impl=_fused_moe_gguf_fake, - ) - fused_moe_gguf = torch.ops.vllm._fused_moe_gguf - -except AttributeError as error: - raise error - - -def _apply_gguf_embedding( - x: torch.Tensor, - qweight: torch.Tensor, - qweight_type: int, - hidden_size: int, - dtype: torch.dtype | None = None, -) -> torch.Tensor: - if qweight_type in UNQUANTIZED_TYPES: - return torch.embedding(qweight, x) - elif qweight_type in DEQUANT_TYPES: - block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type] - x_flat = x.flatten() - assert hidden_size == qweight.shape[1] // type_size * block_size - quant = torch.index_select(qweight, dim=0, index=x_flat) - dequant = ops.ggml_dequantize( - quant, qweight_type, hidden_size, x_flat.shape[0], dtype - ) - return dequant.view(*x.shape, hidden_size) - else: - qweight_type = WeightType(qweight_type) - raise NotImplementedError(f"Unsupported GGUF quantization type: {qweight_type}") - - -def _apply_gguf_embedding_fake( - x: torch.Tensor, - qweight: torch.Tensor, - qweight_type: int, - hidden_size: int, - dtype: torch.dtype | None = None, 
-) -> torch.Tensor: - return torch.empty(x.shape[0], hidden_size, dtype=dtype, device=x.device) - - -try: - direct_register_custom_op( - op_name="_apply_gguf_embedding", - op_func=_apply_gguf_embedding, - fake_impl=_apply_gguf_embedding_fake, - ) - apply_gguf_embedding = torch.ops.vllm._apply_gguf_embedding - -except AttributeError as error: - raise error - - -class GGUFLinearMethod(LinearMethodBase): - """Linear method for GGUF. - - Args: - quant_config: The GGUF quantization config. - """ - - def __init__(self, quant_config: GGUFConfig): - self.quant_config = quant_config - - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - self.params_dtype = params_dtype - output_size_per_partition = sum(output_partition_sizes) - - tensor_shape = (output_size_per_partition, input_size_per_partition) - qweight = GGUFUninitializedParameter(requires_grad=False) - set_weight_attrs( - qweight, - { - "input_dim": 1, - "output_dim": 0, - "tensor_shape": tensor_shape, - "is_gguf_weight": True, - "data_container": [], - "shard_id": [], - "shard_id_map": {}, - }, - ) - set_weight_attrs(qweight, extra_weight_attrs) - layer.register_parameter("qweight", qweight) - - qweight_type = Parameter( - torch.empty(len(output_partition_sizes), dtype=torch.uint8), - requires_grad=False, - ) - set_weight_attrs( - qweight_type, - { - "is_gguf_weight_type": True, - "weight_type": 0, - "shard_weight_type": {}, - "ignore_warning": True, - }, - ) - set_weight_attrs(qweight_type, extra_weight_attrs) - layer.register_parameter("qweight_type", qweight_type) - - def process_weights_after_loading(self, layer: torch.nn.Module): - qweight_type = layer.qweight_type.weight_type - if not (qweight_type in UNQUANTIZED_TYPES or qweight_type in DEQUANT_TYPES): - qweight_type = WeightType(qweight_type) - raise ValueError( - f"Unsupported GGUF 
quantization type {qweight_type} in layer {layer}." - ) - # For MergedColumnParallelLinear and QKVParallelLinear, we need to - # materialize the padded weight parameter for CUDA Graph compatibility. - self._create_padded_weight_param(layer) - - def _create_padded_weight_param(self, layer: torch.nn.Module): - """Create padded weight parameter for GGUF MergedLinear layer.""" - qweight = layer.qweight - shard_id_map = qweight.shard_id_map - shard_id = qweight.shard_id - if len(data_container := qweight.data_container) > 1: - dtype = {data.dtype for data in data_container} - assert len(dtype) == 1, ValueError( - f"Data container has mixed dtypes: {dtype}" - ) - dtype = next(iter(dtype)) - # concat dim0 and pad dim1 - padded_side = max(x.size(1) for x in data_container) - concat_side = sum(x.size(0) for x in data_container) - # Pad the quantized weights to dense tensor, and create a map - # with the location of each shard in the padded tensor. - padded_data = torch.zeros( - (concat_side, padded_side), dtype=dtype, device=qweight.device - ) - # (dim0_start, dim0_end, dim1_size) - shard_offset_map = dict[str, tuple[int, int, int]]() - for idx in shard_id: - id_in_container = shard_id_map[idx] - start = sum(x.size(0) for x in data_container[:id_in_container]) - end = start + data_container[id_in_container].size(0) - size = data_container[id_in_container].size(1) - padded_data[start:end, :size] = data_container[id_in_container] - shard_offset_map[idx] = (start, end, size) - qweight.data_container.clear() - padded_param = Parameter(padded_data, requires_grad=False) - set_weight_attrs(padded_param, vars(qweight)) - set_weight_attrs(padded_param, {"shard_offset_map": shard_offset_map}) - layer.register_parameter("qweight", padded_param) - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: torch.Tensor | None = None, - ) -> torch.Tensor: - shard_id = layer.qweight.shard_id - - if shard_id: - # dequantize shard weights respectively - shard_id = ["q", "k", 
"v"] if "q" in shard_id else shard_id - qweight = layer.qweight - result = [] - for idx in shard_id: - start, end, offset = layer.qweight.shard_offset_map[idx] - qweight_type = layer.qweight_type.shard_weight_type[idx] - result.append( - fused_mul_mat_gguf( - x, qweight[start:end, :offset].contiguous(), qweight_type - ) - ) - out = torch.cat(result, axis=1) - else: - qweight = layer.qweight - qweight_type = layer.qweight_type.weight_type - out = fused_mul_mat_gguf(x, qweight, qweight_type) - if bias is not None: - out.add_(bias) - return out - - -class GGUFMoEMethod(FusedMoEMethodBase): - """MoE method for GGUF. - - Args: - quant_config: The GGUF quantization config. - """ - - def __init__( - self, - quant_config: GGUFConfig, - moe: FusedMoEConfig, - ): - super().__init__(moe) - self.quant_config = quant_config - - def create_weights( - self, - layer: torch.nn.Module, - num_experts: int, - hidden_size: int, - intermediate_size_per_partition: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - tensor_shape = (num_experts, 2 * intermediate_size_per_partition, hidden_size) - # gate up proj - w13_qweight = GGUFUninitializedParameter(requires_grad=False) - set_weight_attrs( - w13_qweight, - { - "input_dim": 1, - "output_dim": 0, - "tensor_shape": tensor_shape, - "is_gguf_weight": True, - "data_container": [], - }, - ) - set_weight_attrs(w13_qweight, extra_weight_attrs) - layer.register_parameter("w13_qweight", w13_qweight) - - w13_qweight_type = Parameter( - torch.empty(1, dtype=torch.uint8), requires_grad=False - ) - set_weight_attrs( - w13_qweight_type, - {"is_gguf_weight_type": True, "weight_type": 0, "ignore_warning": True}, - ) - set_weight_attrs(w13_qweight_type, extra_weight_attrs) - layer.register_parameter("w13_qweight_type", w13_qweight_type) - - tensor_shape = (num_experts, intermediate_size_per_partition, hidden_size) - # gate down proj - w2_qweight = GGUFUninitializedParameter(requires_grad=False) - set_weight_attrs( - w2_qweight, - { - 
"input_dim": 1, - "output_dim": 0, - "tensor_shape": tensor_shape, - "is_gguf_weight": True, - "data_container": [], - }, - ) - set_weight_attrs(w2_qweight, extra_weight_attrs) - layer.register_parameter("w2_qweight", w2_qweight) - - w2_qweight_type = Parameter( - torch.empty(1, dtype=torch.uint8), requires_grad=False - ) - set_weight_attrs( - w2_qweight_type, - {"is_gguf_weight_type": True, "weight_type": 0, "ignore_warning": True}, - ) - - set_weight_attrs(w2_qweight_type, extra_weight_attrs) - layer.register_parameter("w2_qweight_type", w2_qweight_type) - - def get_fused_moe_quant_config( - self, layer: torch.nn.Module - ) -> FusedMoEQuantConfig | None: - return None - - def apply( - self, - layer: FusedMoE, - x: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor: - if layer.apply_router_weight_on_input: - raise NotImplementedError( - "Apply router weight on input is not supported for" - "fused GGUF MoE method." - ) - - return fused_moe_gguf( - x, - layer.w13_qweight, - layer.w2_qweight, - topk_weights, - topk_ids, - layer.w13_qweight_type.weight_type, - layer.w2_qweight_type.weight_type, - layer.activation.value, - ) - - -class GGUFEmbeddingMethod(GGUFLinearMethod): - """Embedding method for GGUF. - - Args: - quant_config: The GGUF quantization config. 
- """ - - def embedding(self, layer: torch.nn.Module, x: torch.Tensor) -> torch.Tensor: - qweight = layer.qweight - qweight_type = layer.qweight_type.weight_type - hidden_size = qweight.tensor_shape[1] - - return apply_gguf_embedding( - x, qweight, qweight_type, hidden_size, dtype=self.params_dtype - ) - - -class GGUFUninitializedParameter(UninitializedParameter): - cls_to_become = Parameter - data_container: list[torch.Tensor] diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index c4fbe0962e06..24c02a49448f 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -418,8 +418,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): output_dim = getattr(param, "output_dim", None) packed_dim = getattr(param, "packed_dim", None) - # If the parameter is a gguf weight, then load it directly. - if getattr(param, "is_gguf_weight_type", None): + if getattr(param, "needs_custom_weight_type", None): param.data.copy_(loaded_weight) param.weight_type = loaded_weight.item() return @@ -549,8 +548,7 @@ def __init__( def tie_weights(self, embed_tokens: VocabParallelEmbedding): """Tie the weights with word embeddings.""" - # GGUF quantized embed_tokens. 
- if self.quant_config and self.quant_config.get_name() == "gguf": + if self.quant_config and self.quant_config.should_keep_tied_lm_head(): return embed_tokens else: self.weight = embed_tokens.weight diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py index 53b6b3221b54..40be772e220a 100644 --- a/vllm/model_executor/model_loader/__init__.py +++ b/vllm/model_executor/model_loader/__init__.py @@ -12,7 +12,6 @@ from vllm.model_executor.model_loader.bitsandbytes_loader import BitsAndBytesModelLoader from vllm.model_executor.model_loader.default_loader import DefaultModelLoader from vllm.model_executor.model_loader.dummy_loader import DummyModelLoader -from vllm.model_executor.model_loader.gguf_loader import GGUFModelLoader from vllm.model_executor.model_loader.runai_streamer_loader import ( RunaiModelStreamerLoader, ) @@ -34,7 +33,6 @@ "bitsandbytes", "dummy", "fastsafetensors", - "gguf", "instanttensor", "mistral", "npcache", @@ -51,7 +49,6 @@ "bitsandbytes": BitsAndBytesModelLoader, "dummy": DummyModelLoader, "fastsafetensors": DefaultModelLoader, - "gguf": GGUFModelLoader, "instanttensor": DefaultModelLoader, "mistral": DefaultModelLoader, "npcache": DefaultModelLoader, @@ -149,7 +146,6 @@ def get_model( "register_model_loader", "BaseModelLoader", "BitsAndBytesModelLoader", - "GGUFModelLoader", "DefaultModelLoader", "DummyModelLoader", "RunaiModelStreamerLoader", diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py deleted file mode 100644 index ce6a813b8da5..000000000000 --- a/vllm/model_executor/model_loader/gguf_loader.py +++ /dev/null @@ -1,436 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os -from collections.abc import Generator -from typing import TYPE_CHECKING, cast - -import gguf -import regex as re -import torch -import torch.nn as nn -from huggingface_hub import 
hf_hub_download -from transformers import AutoModelForCausalLM, AutoModelForImageTextToText - -from vllm.config import ModelConfig, VllmConfig -from vllm.config.load import LoadConfig -from vllm.logger import init_logger -from vllm.model_executor.model_loader.base_loader import BaseModelLoader -from vllm.model_executor.model_loader.utils import ( - initialize_model, - process_weights_after_loading, -) -from vllm.model_executor.model_loader.weight_utils import ( - download_gguf, - get_gguf_extra_tensor_names, - get_gguf_weight_type_map, - gguf_quant_weights_iterator, - gguf_quant_weights_iterator_multi, -) -from vllm.transformers_utils.gguf_utils import detect_gguf_multimodal -from vllm.utils.torch_utils import set_default_torch_dtype - -if TYPE_CHECKING: - from vllm.model_executor.layers.quantization.gguf import GGUFConfig - -logger = init_logger(__name__) - - -class GGUFModelLoader(BaseModelLoader): - """ - Model loader that can load GGUF files. This is useful for loading models - that are quantized with GGUF and saved in the GGUF format. This loader - supports loading both full models and sharded models. 
- """ - - def __init__(self, load_config: LoadConfig): - super().__init__(load_config) - if load_config.model_loader_extra_config: - raise ValueError( - f"Model loader extra config is not supported for " - f"load format {load_config.load_format}" - ) - - def _prepare_weights(self, model_config: ModelConfig): - model_name_or_path = model_config.model - if os.path.isfile(model_name_or_path): - return model_name_or_path - # repo id/filename.gguf - if "/" in model_name_or_path and model_name_or_path.endswith(".gguf"): - repo_id, filename = model_name_or_path.rsplit("/", 1) - return hf_hub_download(repo_id=repo_id, filename=filename) - # repo_id:quant_type - elif "/" in model_name_or_path and ":" in model_name_or_path: - repo_id, quant_type = model_name_or_path.rsplit(":", 1) - return download_gguf( - repo_id, - quant_type, - cache_dir=self.load_config.download_dir, - revision=model_config.revision, - ignore_patterns=self.load_config.ignore_patterns, - ) - - raise ValueError( - f"Unrecognised GGUF reference: {model_name_or_path} " - "(expected local file, /.gguf, " - "or :)" - ) - - @staticmethod - def _get_all_gguf_files(model_path: str) -> list[str]: - """Discover all GGUF shard files from a single shard path. - - Supports variable-width shard indices by dynamically detecting - the padding from the original filename. - E.g. ``*-00001-of-00005.gguf`` → all 5 shards, - ``*-01-of-15.gguf`` → all 15 shards. 
- """ - match = re.search(r"-(\d+)-of-(\d+)\.gguf$", model_path) - if not match: - return [model_path] - total = int(match.group(2)) - num_digits = len(match.group(1)) - prefix = model_path[: match.start(1)] - suffix = model_path[match.end(2) :] - files = [] - for i in range(1, total + 1): - shard_path = f"{prefix}{i:0{num_digits}d}-of-{total:0{num_digits}d}{suffix}" - if os.path.isfile(shard_path): - files.append(shard_path) - if files: - logger.info("Discovered %d GGUF shard files", len(files)) - return files if files else [model_path] - - def _get_gguf_weights_map(self, model_config: ModelConfig): - """ - GGUF uses this naming convention for their tensors from HF checkpoint: - `blk.N.BB.weight` and `blk.N.BB.bias` - where N signifies the block number of a layer, and BB signifies the - attention/mlp layer components. - See "Standardized tensor names" in - https://github.com/ggerganov/ggml/blob/master/docs/gguf.md for details. - """ - config = model_config.hf_config - # Get text config to handle both nested (multimodal) and flat - # (text-only) config structures. For multimodal models like - # Gemma3Config, this returns config.text_config. For text-only - # models, this returns config itself. 
- text_config = config.get_text_config() - model_type = config.model_type - is_multimodal = ( - hasattr(config, "vision_config") and config.vision_config is not None - ) - gguf_to_hf_name_map = {} - sideload_params: list[re.Pattern] = [] - # hack: ggufs have a different name than transformers - if model_type == "cohere": - model_type = "command-r" - if model_type == "gemma3_text": - # Gemma3 models use "gemma3_text" in HuggingFace but - # "gemma3" in GGUF architecture naming - model_type = "gemma3" - if model_type in ("deepseek_v3", "deepseek_v2"): - model_type = "deepseek2" - # GGUF layer map assumes that we will have a merged expert weights - # so we need to map them manually - for idx in range(config.num_hidden_layers): - gguf_to_hf_name_map[f"blk.{idx}.exp_probs_b.bias"] = ( - f"model.layers.{idx}.mlp.gate.e_score_correction_bias" - ) - gguf_to_hf_name_map[f"blk.{idx}.ffn_down_exps.weight"] = ( - f"model.layers.{idx}.mlp.experts.0.down_proj.weight" - ) - gguf_to_hf_name_map[f"blk.{idx}.ffn_gate_exps.weight"] = ( - f"model.layers.{idx}.mlp.experts.0.gate_proj.weight" - ) - gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = ( - f"model.layers.{idx}.mlp.experts.0.up_proj.weight" - ) - sideload_params.append( - re.compile( - f"model\\.layers\\.{idx}" - r"\.mlp\.experts\.[0-9]+\.(gate|up|down)_proj\.weight" - ) - ) - if model_type in ("qwen2_moe", "qwen3_moe"): - model_type = model_type.replace("_", "") - # GGUF layer map assumes that we will have a merged expert weights - # so we need to map them manually - for idx in range(config.num_hidden_layers): - gguf_to_hf_name_map[f"blk.{idx}.ffn_down_exps.weight"] = ( - f"model.layers.{idx}.mlp.experts.0.down_proj.weight" - ) - gguf_to_hf_name_map[f"blk.{idx}.ffn_gate_exps.weight"] = ( - f"model.layers.{idx}.mlp.experts.0.gate_proj.weight" - ) - gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = ( - f"model.layers.{idx}.mlp.experts.0.up_proj.weight" - ) - sideload_params.append( - re.compile( - 
f"model\\.layers\\.{idx}" - r"\.mlp\.experts\.[0-9]+\.(gate|up|down)_proj\.weight" - ) - ) - if model_type == "minimax_m2": - model_type = "minimax-m2" - # GGUF layer map assumes merged expert weights - # map them manually like deepseek2 - for idx in range(config.num_hidden_layers): - gguf_to_hf_name_map[f"blk.{idx}.exp_probs_b.bias"] = ( - f"model.layers.{idx}.block_sparse_moe.e_score_correction_bias" - ) - gguf_to_hf_name_map[f"blk.{idx}.ffn_down_exps.weight"] = ( - f"model.layers.{idx}.block_sparse_moe.experts.0.w2.weight" - ) - gguf_to_hf_name_map[f"blk.{idx}.ffn_gate_exps.weight"] = ( - f"model.layers.{idx}.block_sparse_moe.experts.0.w1.weight" - ) - gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = ( - f"model.layers.{idx}.block_sparse_moe.experts.0.w3.weight" - ) - sideload_params.append( - re.compile( - f"model\\.layers\\.{idx}" - r"\.block_sparse_moe\.experts\.(gate_up_proj|down_proj)" - ) - ) - - arch = None - for key, value in gguf.MODEL_ARCH_NAMES.items(): - if value == model_type: - arch = key - break - if arch is None: - raise RuntimeError(f"Unknown gguf model_type: {model_type}") - text_num_layers = text_config.num_hidden_layers - text_name_map = gguf.get_tensor_name_map(arch, text_num_layers) - - if is_multimodal: - mm_proj_arch = gguf.MODEL_ARCH.MMPROJ - vision_num_layers = config.vision_config.num_hidden_layers - vision_name_map = gguf.get_tensor_name_map(mm_proj_arch, vision_num_layers) - else: - vision_name_map = None - - # Create dummy model to extract parameter names - # For multimodal: use AutoModelForImageTextToText to get - # language + vision + projector params - # For text-only: use AutoModelForCausalLM to get language model params - auto_cls = ( - AutoModelForImageTextToText if is_multimodal else AutoModelForCausalLM - ) - with torch.device("meta"): - dummy_model = auto_cls.from_config( - config, trust_remote_code=model_config.trust_remote_code - ) - - state_dict = dummy_model.state_dict() - if hf_checkpoint_map := getattr( - 
dummy_model, "_checkpoint_conversion_mapping", None - ): - - def revert_hf_rename(name: str) -> str: - for original_name, hf_name in hf_checkpoint_map.items(): - if hf_name in name: - name = name.replace(hf_name, original_name).lstrip("^") - return name - - state_dict = { - revert_hf_rename(name): tensor for name, tensor in state_dict.items() - } - - if model_type == "minimax-m2" and not hf_checkpoint_map: - # Reverse HF convention: mlp -> block_sparse_moe - state_dict = { - name.replace(".mlp.", ".block_sparse_moe."): tensor - for name, tensor in state_dict.items() - } - - def find_hf_name_in_tensor_map(hf_name: str) -> str | None: - """ - Map HuggingFace parameter name to GGUF tensor name. - - This function handles the mismatch between HF parameter naming - conventions and gguf-py's expected format: - 1. Strips 'model.' prefix (common in multimodal models) - 2. Converts '_weight' suffix to '.weight' (Gemma3 compatibility) - 3. Searches vision_name_map for multimodal parameters - 4. Falls back to text_name_map for language model parameters - - Args: - hf_name: Full HuggingFace parameter name (e.g., - 'model.multi_modal_projector.mm_soft_emb_norm.weight') - - Returns: - GGUF tensor name with suffix (e.g., 'mm.soft_emb_norm.weight') - or None if no mapping found - """ - # Strip 'language_model.' prefix for multimodal models - gguf-py - # tensor mappings expect parameter names without this prefix. - # Note: 'model.' prefix should be KEPT for text-only models as - # gguf-py expects it. - if hf_name.startswith("language_model."): - hf_name = hf_name[15:] # Remove 'language_model.' 
- - # Parse parameter name and suffix - if hf_name.endswith((".weight", ".bias")): - base_name, suffix = hf_name.rsplit(".", 1) - else: - base_name, suffix = hf_name, "" - # Handle '_weight' suffix (Gemma3 naming: parameter ends with - # '_weight' instead of '.weight') - if base_name.endswith("_weight"): - base_name = base_name[:-7] # Remove '_weight' - suffix = "weight" - - gguf_name = None - # Priority 1: Search vision/projector parameters for multimodal models - if vision_name_map is not None: - gguf_name = vision_name_map.get_name(base_name) - - # Priority 2: Search text backbone parameters - if gguf_name is None: - gguf_name = text_name_map.get_name(base_name) - - if gguf_name is None: - return None - - return gguf_name + "." + suffix - - # Build mapping and track unmapped parameters - unmapped_params = [] - for hf_name in state_dict: - gguf_name_with_suffix = find_hf_name_in_tensor_map(hf_name) - - # Track mapping success - if gguf_name_with_suffix is not None: - gguf_to_hf_name_map[gguf_name_with_suffix] = hf_name - logger.debug("Mapped GGUF %s → HF %s", gguf_name_with_suffix, hf_name) - elif hf_name not in gguf_to_hf_name_map.values(): - # Parameter not in manual overrides either - unmapped_params.append(hf_name) - - # All parameters (except those initialized by other means) must be mapped: - # both vision/projector and backbone - if unmapped_params: - unmapped_params = list( - filter( - lambda x: not any(re.fullmatch(p, x) for p in sideload_params), - unmapped_params, - ) - ) - if unmapped_params: - raise RuntimeError( - f"Failed to map GGUF parameters " - f"({len(unmapped_params)}): " - f"{unmapped_params}" - ) - return gguf_to_hf_name_map - - def _get_gguf_weight_type( - self, - model_config: ModelConfig, - model_name_or_path: str, - gguf_to_hf_name_map: dict[str, str], - ) -> dict[str, str]: - gguf_files = self._get_all_gguf_files(model_name_or_path) - weight_type_map = {} - for f in gguf_files: - weight_type_map.update(get_gguf_weight_type_map(f, 
gguf_to_hf_name_map)) - is_multimodal = hasattr(model_config.hf_config, "vision_config") - if is_multimodal: - mmproj_file = detect_gguf_multimodal(model_name_or_path) - assert mmproj_file is not None, ( - "Could not find mm_proj file for multimodal GGUF model" - ) - logger.info("Loading extra mm_proj weights from %s...", mmproj_file) - mm_proj_weight_type_map = get_gguf_weight_type_map( - mmproj_file, gguf_to_hf_name_map - ) - weight_type_map.update(mm_proj_weight_type_map) - return weight_type_map - - def _get_weights_iterator( - self, - model_config: ModelConfig, - model_name_or_path: str, - gguf_to_hf_name_map: dict[str, str], - ) -> Generator[tuple[str, torch.Tensor], None, None]: - """ - Iterate over GGUF model weights, loading from both main model file and - mmproj.gguf for multimodal Gemma3 models. - - For Gemma3 multimodal GGUF models: - - Main file (gemma-3-*.gguf): Language model weights (model.*) - - mmproj file (mmproj*.gguf): Vision tower + projector weights (v.*, mm.*) - - Yields: - Tuples of (parameter_name, tensor) for all model weights - """ - hf_config = model_config.hf_config - is_multimodal = hasattr(hf_config, "vision_config") - - if is_multimodal: - # Load mm_proj (mm_encoder + projector) for multimodal weights - mmproj_file = detect_gguf_multimodal(model_name_or_path) - assert mmproj_file is not None, ( - "Could not find mm_proj file for multimodal GGUF model" - ) - yield from gguf_quant_weights_iterator(mmproj_file, gguf_to_hf_name_map) - - gguf_files = self._get_all_gguf_files(model_name_or_path) - if len(gguf_files) > 1: - yield from gguf_quant_weights_iterator_multi( - gguf_files, gguf_to_hf_name_map - ) - else: - yield from gguf_quant_weights_iterator( - model_name_or_path, gguf_to_hf_name_map - ) - - def download_model(self, model_config: ModelConfig) -> None: - self._prepare_weights(model_config) - - def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None: - local_model_path = self._prepare_weights(model_config) - 
gguf_weights_map = self._get_gguf_weights_map(model_config) - model.load_weights( - self._get_weights_iterator(model_config, local_model_path, gguf_weights_map) - ) - - def load_model( - self, vllm_config: VllmConfig, model_config: ModelConfig, prefix: str = "" - ) -> nn.Module: - device_config = vllm_config.device_config - local_model_path = self._prepare_weights(model_config) - gguf_weights_map = self._get_gguf_weights_map(model_config) - # we can only know if tie word embeddings after mapping weights - gguf_files = self._get_all_gguf_files(local_model_path) - all_extra_names = [] - for f in gguf_files: - all_extra_names.extend(get_gguf_extra_tensor_names(f, gguf_weights_map)) - if "lm_head.weight" in all_extra_names: - model_config.hf_config.update({"tie_word_embeddings": True}) - - weight_type_map = self._get_gguf_weight_type( - model_config, local_model_path, gguf_weights_map - ) - # filter out unquantized modules to skip - unquant_names = [ - name.removesuffix(".weight") - for name, weight_type in weight_type_map.items() - if weight_type in ("F32", "F16", "BF16") and name.endswith(".weight") - ] - logger.debug("GGUF unquantized modules: %s", unquant_names) - if TYPE_CHECKING: - vllm_config.quant_config = cast(GGUFConfig, vllm_config.quant_config) - vllm_config.quant_config.unquantized_modules.extend(unquant_names) - - target_device = torch.device(device_config.device) - with set_default_torch_dtype(model_config.dtype): - with target_device: - model = initialize_model(vllm_config=vllm_config, prefix=prefix) - self.load_weights(model, model_config) - - process_weights_after_loading(model, model_config, target_device) - return model diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 3b961e8e143d..45d83e2f7179 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -50,11 +50,6 @@ runai_model_streamer = 
PlaceholderModule("runai_model_streamer") # type: ignore[assignment] SafetensorsStreamer = runai_model_streamer.placeholder_attr("SafetensorsStreamer") -try: - import gguf -except ImportError: - gguf = PlaceholderModule("gguf") - try: from fastsafetensors import SafeTensorsFileLoader, SingleGroup except ImportError: @@ -263,9 +258,8 @@ def get_quant_config( raise ValueError("Model quantization method is not specified in the config.") quant_cls = get_quantization_config(model_config.quantization) - # GGUF doesn't have config file - if model_config.quantization == "gguf": - return quant_cls() + if not quant_cls.requires_hf_quant_config(): + return quant_cls.from_config({}) # Read the quantization config from the HF model config, if available. hf_quant_config = getattr(model_config.hf_config, "quantization_config", None) @@ -466,52 +460,6 @@ def get_sparse_attention_config( return config -def download_gguf( - repo_id: str, - quant_type: str, - cache_dir: str | None = None, - revision: str | None = None, - ignore_patterns: str | list[str] | None = None, -) -> str: - # Use patterns that snapshot_download can handle directly - # Patterns to match: - # - *-{quant_type}.gguf (root) - # - *-{quant_type}-*.gguf (root sharded) - # - */*-{quant_type}.gguf (subdir) - # - */*-{quant_type}-*.gguf (subdir sharded) - allow_patterns = [ - f"*-{quant_type}.gguf", - f"*-{quant_type}-*.gguf", - f"*/*-{quant_type}.gguf", - f"*/*-{quant_type}-*.gguf", - ] - - # Use download_weights_from_hf which handles caching and downloading - folder = download_weights_from_hf( - model_name_or_path=repo_id, - cache_dir=cache_dir, - allow_patterns=allow_patterns, - revision=revision, - ignore_patterns=ignore_patterns, - ) - - # Find the downloaded file(s) in the folder - local_files = [] - for pattern in allow_patterns: - # Convert pattern to glob pattern for local filesystem - glob_pattern = os.path.join(folder, pattern) - local_files.extend(glob.glob(glob_pattern)) - - if not local_files: - raise 
ValueError( - f"Downloaded GGUF files not found in {folder} for quant_type {quant_type}" - ) - - # Sort to ensure consistent ordering (prefer non-sharded files) - local_files.sort(key=lambda x: (x.count("-"), x)) - return local_files[0] - - @instrument(span_name="Download weights - HF") def download_weights_from_hf( model_name_or_path: str, @@ -1231,118 +1179,6 @@ def _load_file(bin_file: str): del state -def get_gguf_extra_tensor_names( - gguf_file: str | Path, gguf_to_hf_name_map: dict[str, str] -) -> list[str]: - reader = gguf.GGUFReader(gguf_file) - expected_gguf_keys = set(gguf_to_hf_name_map.keys()) - exact_gguf_keys = set([tensor.name for tensor in reader.tensors]) - extra_keys = expected_gguf_keys - exact_gguf_keys - return [gguf_to_hf_name_map[key] for key in extra_keys] - - -def get_gguf_weight_type_map( - gguf_file: str | Path, gguf_to_hf_name_map: dict[str, str] -) -> dict[str, str]: - """ - Return GGUF mapped weight's name and its quant type - """ - reader = gguf.GGUFReader(gguf_file) - return { - gguf_to_hf_name_map[tensor.name]: tensor.tensor_type.name - for tensor in reader.tensors - if tensor.name in gguf_to_hf_name_map - } - - -def gguf_quant_weights_iterator( - gguf_file: str | Path, gguf_to_hf_name_map: dict[str, str] -) -> Generator[tuple[str, torch.Tensor], None, None]: - """ - Iterate over the quant weights in the model gguf files and convert - them to torch tensors. - Be careful of the order of yielding weight types and weights data, - we have to yield all weight types first before yielding any weights. - Otherwise it would cause issue when loading weights with for packed - layer with different quant types. 
- """ - - reader = gguf.GGUFReader(gguf_file) - - for tensor in reader.tensors: - if tensor.name in gguf_to_hf_name_map: - weight_type = tensor.tensor_type - name = gguf_to_hf_name_map[tensor.name] - - if weight_type.name not in ("F32", "BF16", "F16"): - weight_type_name = name.replace("weight", "qweight_type") - weight_type = torch.tensor(weight_type) - yield weight_type_name, weight_type - - for tensor in reader.tensors: - if tensor.name in gguf_to_hf_name_map: - weight = tensor.data - weight_type = tensor.tensor_type - name = gguf_to_hf_name_map[tensor.name] - if weight_type.name not in ("F32", "BF16", "F16"): - name = name.replace("weight", "qweight") - if weight_type.name == "BF16" and tensor.data.dtype == np.uint8: - # BF16 is currently the only "quantization" type that isn't - # actually quantized but is read as a raw byte tensor. - # Reinterpret as `torch.bfloat16` tensor. - weight = weight.view(np.uint16) - if reader.byte_order == "S": - # GGUF endianness != system endianness - weight = weight.byteswap() - param = torch.tensor(weight).view(torch.bfloat16) - else: - param = torch.tensor(weight) - yield name, param - - -def gguf_quant_weights_iterator_multi( - gguf_files: list[str], gguf_to_hf_name_map: dict[str, str] -) -> Generator[tuple[str, torch.Tensor], None, None]: - """ - Iterate over the quant weights across multiple GGUF shard files - and convert them to torch tensors. - - Like gguf_quant_weights_iterator, we yield all weight types first - before yielding any weights data to avoid issues with packed layers - that have different quant types. 
- """ - readers = [gguf.GGUFReader(f) for f in gguf_files] - - # First pass: yield all weight types across all shards - for reader in readers: - for tensor in reader.tensors: - if tensor.name in gguf_to_hf_name_map: - weight_type = tensor.tensor_type - name = gguf_to_hf_name_map[tensor.name] - if weight_type.name not in ("F32", "BF16", "F16"): - weight_type_name = name.replace("weight", "qweight_type") - weight_type = torch.tensor(weight_type) - yield weight_type_name, weight_type - - # Second pass: yield all weight data across all shards - for reader in readers: - for tensor in reader.tensors: - if tensor.name in gguf_to_hf_name_map: - weight = tensor.data - weight_type = tensor.tensor_type - name = gguf_to_hf_name_map[tensor.name] - if weight_type.name not in ("F32", "BF16", "F16"): - name = name.replace("weight", "qweight") - if weight_type.name == "BF16" and tensor.data.dtype == np.uint8: - weight = weight.view(np.uint16) - if reader.byte_order == "S": - weight = weight.byteswap() - param = torch.tensor(weight).view(torch.bfloat16) - else: - param = torch.tensor(weight) - yield name, param - - def convert_pyslice_to_tensor(x: Any) -> torch.Tensor: """convert PySafeSlice object from safetensors to torch.Tensor diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index 5905a198b289..7254f41fd2f0 100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -228,9 +228,10 @@ def _init_rotary_emb( quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True - is_gguf = quant_config and quant_config.get_name() == "gguf" - if is_gguf and config.model_type == "apertus": - is_neox_style = False + if quant_config is not None: + override = quant_config.override_is_neox_style(config.model_type) + if override is not None: + is_neox_style = override self.rotary_emb = get_rope( self.head_dim, diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 
b633fd285082..4a56aa676073 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -162,8 +162,10 @@ def __init__( ) is_neox_style = True - if quant_config is not None and quant_config.get_name() == "gguf": - is_neox_style = False + if quant_config is not None: + override = quant_config.override_is_neox_style(config.model_type) + if override is not None: + is_neox_style = override self.rotary_emb = get_rope( self.head_dim, diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index 04708de93d39..51b317a51229 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -168,8 +168,10 @@ def __init__( self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) is_neox_style = True - if quant_config is not None and quant_config.get_name() == "gguf": - is_neox_style = False + if quant_config is not None: + override = quant_config.override_is_neox_style(config.model_type) + if override is not None: + is_neox_style = override layer_idx = extract_layer_index(prefix) is_sliding = config.layer_types[layer_idx] == "sliding_attention" diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index b2352a3c9268..5b3f0688de4d 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -383,14 +383,10 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() for name, loaded_weight in weights: - # Revert +1 during llama.cpp conversion - # see: https://github.com/ggml-org/llama.cpp/blob/be7c3034108473beda214fd1d7c98fd6a7a3bdf5/convert_hf_to_gguf.py#L3397-L3400 - if ( - self.quant_config - and self.quant_config.get_name() == "gguf" - and name.endswith("norm.weight") - ): - loaded_weight -= 1 + if self.quant_config is not None: + loaded_weight = self.quant_config.transform_loaded_weight( + name, 
loaded_weight + ) if self.quant_config is not None and ( scale_name := self.quant_config.get_cache_scale(name) diff --git a/vllm/model_executor/models/jais2.py b/vllm/model_executor/models/jais2.py index 4e03eb12ee44..0f3ec5315847 100644 --- a/vllm/model_executor/models/jais2.py +++ b/vllm/model_executor/models/jais2.py @@ -161,8 +161,10 @@ def __init__( ) is_neox_style = True - if quant_config is not None and quant_config.get_name() == "gguf": - is_neox_style = False + if quant_config is not None: + override = quant_config.override_is_neox_style(config.model_type) + if override is not None: + is_neox_style = override self.rotary_emb = get_rope( self.head_dim, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2ecced3df8ba..9a3456b4fe86 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -238,9 +238,10 @@ def _init_rotary_emb( quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True - is_gguf = quant_config and quant_config.get_name() == "gguf" - if is_gguf and config.model_type == "llama": - is_neox_style = False + if quant_config is not None: + override = quant_config.override_is_neox_style(config.model_type) + if override is not None: + is_neox_style = override self.rotary_emb = get_rope( self.head_dim, diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index b84b4e2ae512..2c20c0673e2e 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -240,9 +240,10 @@ def __init__( prefix=f"{prefix}.o_proj", ) is_neox_style = True - is_gguf = quant_config and quant_config.get_name() == "gguf" - if is_gguf and config.model_type == "llama": - is_neox_style = False + if quant_config is not None: + override = quant_config.override_is_neox_style(config.model_type) + if override is not None: + is_neox_style = override self.rotary_emb = ( get_rope( diff --git a/vllm/model_executor/models/openpangu.py 
b/vllm/model_executor/models/openpangu.py index 994ae82529ab..7f84f324eb09 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -533,9 +533,10 @@ def _init_rotary_emb( quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True - is_gguf = quant_config and quant_config.get_name() == "gguf" - if is_gguf and config.model_type == "PanguEmbedded": - is_neox_style = False + if quant_config is not None: + override = quant_config.override_is_neox_style(config.model_type) + if override is not None: + is_neox_style = override rope_parameters = config.rope_parameters or {} if rope_parameters is not None and rope_parameters.get( @@ -732,14 +733,16 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): # no need to narrow is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit - # Special case for GGUF - is_gguf_weight = getattr(param, "is_gguf_weight", False) - is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) - if is_gguf_weight_type: + needs_custom_weight_materialization = getattr( + param, "needs_custom_weight_materialization", False + ) + needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False) + if needs_custom_weight_type: param.weight_type = loaded_weight.item() - # Materialize GGUF UninitializedParameter - if is_gguf_weight and isinstance(param, nn.UninitializedParameter): + if needs_custom_weight_materialization and isinstance( + param, nn.UninitializedParameter + ): final_shape = list(loaded_weight.shape) if output_dim is not None: assert final_shape[output_dim] % self.tp_size == 0 diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index ce3a260d0ef6..c50d737878f5 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -968,20 +968,9 @@ def maybe_swap_ffn_param( params_dict: dict[str, torch.Tensor], quant_config: QuantizationConfig, ) -> torch.Tensor: - if not 
(quant_config and quant_config.get_name() == "gguf") or ".fc" not in name: + if quant_config is None or ".fc" not in name: return param - # Some GGUF models have fc1 and fc2 weights swapped - tp_size = get_tensor_model_parallel_world_size() - output_dim = getattr(param, "output_dim", 0) - output_size = param.size(output_dim) * tp_size - weight_out_size = loaded_weight.size(output_dim) - if ".fc1." in name and output_size != weight_out_size: - new_name = name.replace(".fc1.", ".fc2.") - param = params_dict[new_name] - elif ".fc2." in name and output_size != weight_out_size: - new_name = name.replace(".fc2.", ".fc1.") - param = params_dict[new_name] - return param + return quant_config.remap_loaded_parameter(name, param, loaded_weight, params_dict) # Adapted from: https://github.com/huggingface/transformers/blob/v4.54.1/src/transformers/models/siglip/modeling_siglip.py#L200 diff --git a/vllm/model_format.py b/vllm/model_format.py new file mode 100644 index 000000000000..cb77983a9470 --- /dev/null +++ b/vllm/model_format.py @@ -0,0 +1,162 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from pathlib import Path +from typing import TYPE_CHECKING, Any + +from vllm.logger import init_logger + +if TYPE_CHECKING: + from transformers import PretrainedConfig + +logger = init_logger(__name__) + + +class ModelFormatHandler: + """Extension hook for out-of-tree model formats. + + Handlers can customize how a model reference is interpreted across vLLM, + such as model/config discovery, tokenizer and processor resolution, and + engine-arg defaults. 
+ """ + + name: str = "" + + def matches(self, model: str | Path | None) -> bool: + return False + + def update_engine_args(self, engine_args: Any) -> None: + return + + def prepare_hf_config_load( + self, + model: str | Path, + revision: str | None = None, + kwargs: dict[str, Any] | None = None, + ) -> tuple[str | Path, dict[str, Any]]: + return model, kwargs or {} + + def should_use_hf_config_parser( + self, + original_model: str | Path, + resolved_model: str | Path, + ) -> bool: + return False + + def get_missing_hf_config_error( + self, + original_model: str | Path, + resolved_model: str | Path, + ) -> str | None: + return None + + def patch_parsed_hf_config( + self, + original_model: str | Path, + config_dict: dict[str, Any], + config: "PretrainedConfig", + ) -> "PretrainedConfig": + return config + + def patch_model_hf_config( + self, + original_model: str | Path, + hf_config: "PretrainedConfig", + ) -> "PretrainedConfig": + return hf_config + + def resolve_tokenizer_init( + self, + tokenizer_name: str | Path, + *args: Any, + revision: str | None = None, + runner_type: str = "generate", + tokenizer_mode: str = "auto", + **kwargs: Any, + ) -> tuple[str | Path, tuple[Any, ...], dict[str, Any]]: + return tokenizer_name, args, kwargs + + def resolve_processor_source( + self, + model_config: Any, + component: str, + ) -> tuple[str | Path, str | None]: + return model_config.model, model_config.revision + + def validate_model_config(self, model_config: Any) -> None: + return + + def resolve_sentence_transformer_source( + self, + model: str | Path, + revision: str | None = None, + ) -> str | Path: + return model + + def resolve_image_processor_source( + self, + model: str | Path, + revision: str | None = None, + ) -> str | Path: + return model + + def should_skip_generation_config(self, model: str | Path) -> bool: + return False + + +_MODEL_FORMAT_HANDLERS: list[ModelFormatHandler] = [] + + +def register_model_format(handler: ModelFormatHandler) -> 
ModelFormatHandler: + if not isinstance(handler, ModelFormatHandler): + raise ValueError("The model format handler must subclass `ModelFormatHandler`.") + + replaced = False + if handler.name: + for idx, existing in enumerate(_MODEL_FORMAT_HANDLERS): + if existing.name == handler.name: + logger.warning( + "The model format handler %r already exists and will be " + "overwritten by %s.", + handler.name, + type(handler), + ) + _MODEL_FORMAT_HANDLERS[idx] = handler + replaced = True + break + + if not replaced: + _MODEL_FORMAT_HANDLERS.append(handler) + + return handler + + +def get_model_format_handler(model: str | Path | None) -> ModelFormatHandler | None: + for handler in reversed(_MODEL_FORMAT_HANDLERS): + if handler.matches(model): + return handler + return None + + +def prepare_hf_model_reference( + model: str | Path, + revision: str | None = None, + **kwargs: Any, +) -> tuple[ModelFormatHandler | None, str | Path, dict[str, Any]]: + handler = get_model_format_handler(model) + if handler is None: + return None, model, kwargs + resolved_model, resolved_kwargs = handler.prepare_hf_config_load( + model, + revision=revision, + kwargs=kwargs, + ) + return handler, resolved_model, resolved_kwargs + + +__all__ = [ + "ModelFormatHandler", + "get_model_format_handler", + "prepare_hf_model_reference", + "register_model_format", +] diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index bb525d9251f4..c3d9d49428fe 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -409,7 +409,6 @@ class RocmPlatform(Platform): "fp8", "compressed-tensors", "fbgemm_fp8", - "gguf", "quark", "mxfp4", "torchao", diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py index 7d48e3c6ff91..c4ee1ad9a310 100644 --- a/vllm/tokenizers/registry.py +++ b/vllm/tokenizers/registry.py @@ -10,13 +10,7 @@ import vllm.envs as envs from vllm.logger import init_logger -from vllm.transformers_utils.gguf_utils import ( - check_gguf_file, - get_gguf_file_path_from_hf, - 
is_gguf, - is_remote_gguf, - split_remote_gguf, -) +from vllm.model_format import get_model_format_handler from vllm.transformers_utils.repo_utils import ( any_pattern_in_repo_files, is_mistral_model_repo, @@ -115,20 +109,15 @@ def resolve_tokenizer_args( ) tokenizer_name = tokenizer_path - # Separate model folder from file path for GGUF models - if is_gguf(tokenizer_name): - if check_gguf_file(tokenizer_name): - kwargs["gguf_file"] = Path(tokenizer_name).name - tokenizer_name = Path(tokenizer_name).parent - elif is_remote_gguf(tokenizer_name): - tokenizer_name, quant_type = split_remote_gguf(tokenizer_name) - # Get the HuggingFace Hub path for the GGUF file - gguf_file = get_gguf_file_path_from_hf( - tokenizer_name, - quant_type, - revision=revision, - ) - kwargs["gguf_file"] = gguf_file + if handler := get_model_format_handler(tokenizer_name): + tokenizer_name, args, kwargs = handler.resolve_tokenizer_init( + tokenizer_name, + *args, + revision=revision, + runner_type=runner_type, + tokenizer_mode=tokenizer_mode, + **kwargs, + ) if "truncation_side" not in kwargs: if runner_type == "generate" or runner_type == "draft": diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 5f4b5a3b2a48..fa8beacfebcf 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -18,7 +18,6 @@ from transformers import GenerationConfig, PretrainedConfig from transformers.models.auto.image_processing_auto import get_image_processor_config from transformers.models.auto.modeling_auto import ( - MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_MAPPING_NAMES, ) from transformers.models.auto.tokenization_auto import get_tokenizer_config @@ -26,6 +25,10 @@ from vllm import envs from vllm.logger import init_logger +from vllm.model_format import ( + get_model_format_handler, + prepare_hf_model_reference, +) from vllm.transformers_utils.repo_utils import is_mistral_model_repo from vllm.transformers_utils.utils import ( 
parse_safetensors_file_metadata, @@ -34,12 +37,6 @@ from vllm.utils.torch_utils import common_broadcastable_dtype from .config_parser_base import ConfigParserBase -from .gguf_utils import ( - check_gguf_file, - is_gguf, - is_remote_gguf, - split_remote_gguf, -) from .repo_utils import ( file_or_path_exists, get_hf_file_to_dict, @@ -573,20 +570,17 @@ def maybe_override_with_speculators( Returns: Tuple of (resolved_model, resolved_tokenizer, speculative_config) """ - if check_gguf_file(model): - kwargs["gguf_file"] = Path(model).name - gguf_model_repo = Path(model).parent - elif is_remote_gguf(model): - repo_id, _ = split_remote_gguf(model) - gguf_model_repo = Path(repo_id) - else: - gguf_model_repo = None kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE + _, resolved_model, resolved_kwargs = prepare_hf_model_reference( + model, + revision=revision, + **kwargs, + ) config_dict, _ = PretrainedConfig.get_config_dict( - model if gguf_model_repo is None else gguf_model_repo, + resolved_model, revision=revision, token=hf_token, - **without_trust_remote_code(kwargs), + **without_trust_remote_code(resolved_kwargs), ) speculators_config = config_dict.get("speculators_config") @@ -621,23 +615,20 @@ def get_config( hf_overrides_fn: Callable[[PretrainedConfig], PretrainedConfig] | None = None, **kwargs, ) -> PretrainedConfig: - # Separate model folder from file path for GGUF models - - _is_gguf = is_gguf(model) - _is_remote_gguf = is_remote_gguf(model) - if _is_gguf: - if check_gguf_file(model): - # Local GGUF file - kwargs["gguf_file"] = Path(model).name - model = Path(model).parent - elif _is_remote_gguf: - # Remote GGUF - extract repo_id from repo_id:quant_type format - # The actual GGUF file will be downloaded later by GGUFModelLoader - # Keep model as repo_id:quant_type for download, but use repo_id for config - model, _ = split_remote_gguf(model) + original_model = model + handler, model, kwargs = prepare_hf_model_reference( + model, + 
revision=revision, + **kwargs, + ) if config_format == "auto": try: + use_hf_parser_without_config = ( + handler.should_use_hf_config_parser(original_model, model) + if handler is not None + else False + ) # First check for Mistral to avoid defaulting to # Transformers implementation. if is_mistral_model_repo( @@ -646,26 +637,22 @@ def get_config( model=model, config_name=MISTRAL_CONFIG_NAME, revision=revision ): config_format = "mistral" - elif (_is_gguf and not _is_remote_gguf) or file_or_path_exists( + elif use_hf_parser_without_config or file_or_path_exists( model, HF_CONFIG_NAME, revision=revision ): config_format = "hf" - # Remote GGUF models must have config.json in repo, - # otherwise the config can't be parsed correctly. - # FIXME(Isotr0py): Support remote GGUF repos without config.json - elif _is_remote_gguf and not file_or_path_exists( - model, HF_CONFIG_NAME, revision=revision - ): - err_msg = ( - "Could not find config.json for remote GGUF model repo. " - "To load remote GGUF model through `:`, " - "ensure your model has config.json (HF format) file. " - "Otherwise please specify --hf-config-path " - "in engine args to fetch config from unquantized hf model." - ) - logger.error(err_msg) - raise ValueError(err_msg) else: + if ( + handler is not None + and ( + err_msg := handler.get_missing_hf_config_error( + original_model, model + ) + ) + is not None + ): + logger.error(err_msg) + raise ValueError(err_msg) raise ValueError( "Could not detect config format for no config file found. 
" "With config_format 'auto', ensure your model has either " @@ -685,7 +672,7 @@ def get_config( "'config.json'.\n" " - For Mistral models: ensure the presence of a " "'params.json'.\n" - ).format(model=model) + ).format(model=original_model) raise ValueError(error_message) from e @@ -698,34 +685,8 @@ def get_config( hf_overrides=hf_overrides_kw or hf_overrides_fn, **kwargs, ) - - # Patching defaults for GGUF models - if _is_gguf: - # Some models have different default values between GGUF and HF. - def apply_gguf_default(key: str, gguf_default: Any): - """ - Apply GGUF defaults unless explicitly configured. - - This function reads/writes external `config` and `config_dict`. - If the specified `key` is not in `config_dict` (i.e. not explicitly - configured and the default HF value is used), it updates the - corresponding `config` value to `gguf_default`. - """ - if key not in config_dict: - config.update({key: gguf_default}) - - # Apply architecture-specific GGUF defaults. - if config.model_type in {"qwen3_moe"}: - # Qwen3 MoE: norm_topk_prob is always true. - # Note that, this parameter is always false (HF default) on Qwen2 MoE. - apply_gguf_default("norm_topk_prob", True) - - # Special architecture mapping check for GGUF models - if _is_gguf: - if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: - raise RuntimeError(f"Can't get gguf config for {config.model_type}.") - model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type] - config.update({"architectures": [model_type]}) + if handler is not None: + config = handler.patch_parsed_hf_config(original_model, config_dict, config) # Architecture mapping for models without explicit architectures field if not config.architectures: @@ -818,8 +779,8 @@ def get_pooling_config( A dictionary containing the pooling type and whether normalization is used, or None if no pooling configuration is found. 
""" - if is_remote_gguf(model): - model, _ = split_remote_gguf(model) + if handler := get_model_format_handler(model): + model = handler.resolve_sentence_transformer_source(model, revision) modules_file_name = "modules.json" @@ -908,6 +869,9 @@ def get_sentence_transformer_tokenizer_config( - dict: A dictionary containing the configuration parameters for the Sentence Transformer BERT model. """ + if handler := get_model_format_handler(model): + model = handler.resolve_sentence_transformer_source(model, revision) + sentence_transformer_config_files = [ "sentence_bert_config.json", "sentence_roberta_config.json", @@ -1036,11 +1000,8 @@ def get_hf_image_processor_config( # ModelScope does not provide an interface for image_processor if envs.VLLM_USE_MODELSCOPE: return dict() - # Separate model folder from file path for GGUF models - if check_gguf_file(model): - model = Path(model).parent - elif is_remote_gguf(model): - model, _ = split_remote_gguf(model) + if handler := get_model_format_handler(model): + model = handler.resolve_image_processor_source(model, revision) return get_image_processor_config( model, token=hf_token, revision=revision, **kwargs ) @@ -1070,12 +1031,9 @@ def try_get_generation_config( config_format: str | ConfigFormat = "auto", hf_token: bool | str | None = None, ) -> GenerationConfig | None: - # GGUF files don't have generation_config.json - their config is embedded - # in the file header. Skip all filesystem lookups to avoid re-reading the - # memory-mapped file, which can hang in multi-process scenarios when the - # EngineCore process already has the file mapped. 
- if is_gguf(model): - return None + if handler := get_model_format_handler(model): + if handler.should_skip_generation_config(model): + return None try: return GenerationConfig.from_pretrained( diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py deleted file mode 100644 index 7708378ee13b..000000000000 --- a/vllm/transformers_utils/gguf_utils.py +++ /dev/null @@ -1,336 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""GGUF utility functions.""" - -from functools import cache -from os import PathLike -from pathlib import Path - -import gguf -import regex as re -from gguf.constants import Keys, VisionProjectorType -from gguf.quants import GGMLQuantizationType -from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig - -from vllm.logger import init_logger - -from .repo_utils import list_filtered_repo_files - -logger = init_logger(__name__) - - -@cache -def check_gguf_file(model: str | PathLike) -> bool: - """Check if the file is a GGUF model.""" - model = Path(model) - if not model.is_file(): - return False - elif model.suffix == ".gguf": - return True - - try: - with model.open("rb") as f: - header = f.read(4) - - return header == b"GGUF" - except Exception as e: - logger.debug("Error reading file %s: %s", model, e) - return False - - -@cache -def is_remote_gguf(model: str | Path) -> bool: - """Check if the model is a remote GGUF model. - - Recognizes two forms: - 1. Standard: ``repo_id:quant_type`` where *quant_type* is a known - GGML quantization type (e.g. ``Q4_K_M``). - 2. Non-standard: ``repo_id:quant_type`` where *quant_type* contains - a known GGML type with extra prefixes (e.g. ``UD-Q4_K_XL``). - A warning is logged and actual file existence is validated later - during download. 
- """ - pattern = r"^[a-zA-Z0-9][a-zA-Z0-9._-]*/[a-zA-Z0-9][a-zA-Z0-9._-]*:[A-Za-z0-9_+-]+$" - model = str(model) - if re.fullmatch(pattern, model): - _, quant_type = model.rsplit(":", 1) - if is_valid_gguf_quant_type(quant_type): - return True - if is_nonstandard_gguf_quant_type(quant_type): - logger.warning( - "Non-standard GGUF quant type '%s' detected.", - quant_type, - ) - return True - return False - - -def is_nonstandard_gguf_quant_type(quant_type: str) -> bool: - """Check if a non-standard quant type contains a known GGML type. - - Splits the quant type by the last ``-`` and checks whether the - trailing part is a standard GGML type. For example:: - - UD-Q4_K_XL → rsplit → ["UD", "Q4_K_XL"] → Q4_K_XL valid ✓ - UD-IQ4_NL → rsplit → ["UD", "IQ4_NL"] → IQ4_NL valid ✓ - Custom-UD-Q4_K → rsplit → ["Custom-UD", "Q4_K"] → Q4_K valid ✓ - RANDOM → no "-" → False - """ - if "-" not in quant_type: - return False - _, remainder = quant_type.rsplit("-", 1) - return is_valid_gguf_quant_type(remainder) - - -# Common suffixes used in GGUF file naming conventions -# e.g., Q4_K_M, Q3_K_S, Q5_K_L, Q2_K_XL -_GGUF_QUANT_SUFFIXES = ("_M", "_S", "_L", "_XL", "_XS", "_XXS") - - -def is_valid_gguf_quant_type(gguf_quant_type: str) -> bool: - """Check if the quant type is a valid GGUF quant type. - - Supports both exact GGML quant types (e.g., Q4_K, IQ1_S) and - extended naming conventions (e.g., Q4_K_M, Q3_K_S, Q5_K_L). 
- """ - # Check for exact match first - if getattr(GGMLQuantizationType, gguf_quant_type, None) is not None: - return True - - # Check for extended naming conventions (e.g., Q4_K_M -> Q4_K) - for suffix in _GGUF_QUANT_SUFFIXES: - if gguf_quant_type.endswith(suffix): - base_type = gguf_quant_type[: -len(suffix)] - if getattr(GGMLQuantizationType, base_type, None) is not None: - return True - - return False - - -def split_remote_gguf(model: str | Path) -> tuple[str, str]: - """Split the model into repo_id and quant type.""" - model = str(model) - if is_remote_gguf(model): - parts = model.rsplit(":", 1) - return (parts[0], parts[1]) - raise ValueError( - f"Wrong GGUF model or invalid GGUF quant type: {model}.\n" - "- It should be in repo_id:quant_type format.\n" - f"- Valid base quant types: {GGMLQuantizationType._member_names_}\n" - f"- Extended suffixes also supported: {_GGUF_QUANT_SUFFIXES}\n" - "- Non-standard GGUF quant types also supported: " - "dash-separated prefixes (e.g. UD-Q4_K_XL, Custom-Q8_0)", - ) - - -def is_gguf(model: str | Path) -> bool: - """Check if the model is a GGUF model. - - Args: - model: Model name, path, or Path object to check. - - Returns: - True if the model is a GGUF model, False otherwise. - """ - model = str(model) - - # Check if it's a local GGUF file - if check_gguf_file(model): - return True - - # Check if it's a remote GGUF model (repo_id:quant_type format) - return is_remote_gguf(model) - - -def detect_gguf_multimodal(model: str) -> Path | None: - """Check if GGUF model has multimodal projector file. 
- - Args: - model: Model path string - - Returns: - Path to mmproj file if found, None otherwise - """ - if not model.endswith(".gguf"): - return None - - try: - model_path = Path(model) - if not model_path.is_file(): - return None - - model_dir = model_path.parent - mmproj_patterns = ["mmproj.gguf", "mmproj-*.gguf", "*mmproj*.gguf"] - for pattern in mmproj_patterns: - mmproj_files = list(model_dir.glob(pattern)) - if mmproj_files: - return mmproj_files[0] - return None - except Exception: - return None - - -def extract_vision_config_from_gguf(mmproj_path: str) -> "SiglipVisionConfig | None": - """Extract vision config parameters from mmproj.gguf metadata. - - Reads vision encoder configuration from GGUF metadata fields using - standardized GGUF constants. Automatically detects the projector type - (e.g., gemma3, llama4) and applies model-specific parameters accordingly. - - The function extracts standard CLIP vision parameters from GGUF metadata - and applies projector-type-specific customizations. For unknown projector - types, it uses safe defaults from SiglipVisionConfig. - - Args: - mmproj_path: Path to mmproj.gguf file (str or Path) - - Returns: - SiglipVisionConfig if extraction succeeds, None if any required - field is missing from the GGUF metadata - - Raises: - Exception: Exceptions from GGUF reading (file not found, corrupted - file, etc.) propagate directly from gguf.GGUFReader - """ - reader = gguf.GGUFReader(str(mmproj_path)) - - # Detect projector type to apply model-specific parameters - projector_type = None - projector_type_field = reader.get_field(Keys.Clip.PROJECTOR_TYPE) - if projector_type_field: - try: - projector_type = bytes(projector_type_field.parts[-1]).decode("utf-8") - except (AttributeError, UnicodeDecodeError) as e: - logger.warning("Failed to decode projector type from GGUF: %s", e) - - # Map GGUF field constants to SiglipVisionConfig parameters. - # Uses official GGUF constants from gguf-py for standardization. 
- # Format: {gguf_constant: (param_name, dtype)} - VISION_CONFIG_FIELDS = { - Keys.ClipVision.EMBEDDING_LENGTH: ("hidden_size", int), - Keys.ClipVision.FEED_FORWARD_LENGTH: ("intermediate_size", int), - Keys.ClipVision.BLOCK_COUNT: ("num_hidden_layers", int), - Keys.ClipVision.Attention.HEAD_COUNT: ("num_attention_heads", int), - Keys.ClipVision.IMAGE_SIZE: ("image_size", int), - Keys.ClipVision.PATCH_SIZE: ("patch_size", int), - Keys.ClipVision.Attention.LAYERNORM_EPS: ("layer_norm_eps", float), - } - - # Extract and validate all required fields - config_params = {} - for gguf_key, (param_name, dtype) in VISION_CONFIG_FIELDS.items(): - field = reader.get_field(gguf_key) - if field is None: - logger.warning( - "Missing required vision config field '%s' in mmproj.gguf", - gguf_key, - ) - return None - # Extract scalar value from GGUF field and convert to target type - config_params[param_name] = dtype(field.parts[-1]) - - # Apply model-specific parameters based on projector type - if projector_type == VisionProjectorType.GEMMA3: - # Gemma3 doesn't use the vision pooling head (multihead attention) - # This is a vLLM-specific parameter used in SiglipVisionTransformer - config_params["vision_use_head"] = False - logger.info("Detected Gemma3 projector, disabling vision pooling head") - # Add other projector-type-specific customizations here as needed - # elif projector_type == VisionProjectorType.LLAMA4: - # config_params["vision_use_head"] = ... 
- - # Create config with extracted parameters - # Note: num_channels and attention_dropout use SiglipVisionConfig defaults - # (3 and 0.0 respectively) which are correct for all models - config = SiglipVisionConfig(**config_params) - - if projector_type: - logger.info( - "Extracted vision config from mmproj.gguf (projector_type: %s)", - projector_type, - ) - else: - logger.info("Extracted vision config from mmproj.gguf metadata") - - return config - - -def maybe_patch_hf_config_from_gguf( - model: str, - hf_config: PretrainedConfig, -) -> PretrainedConfig: - """Patch HF config for GGUF models. - - Applies GGUF-specific patches to HuggingFace config: - 1. For multimodal models: patches architecture and vision config - 2. For all GGUF models: overrides vocab_size from embedding tensor - - This ensures compatibility with GGUF models that have extended - vocabularies (e.g., Unsloth) where the GGUF file contains more - tokens than the HuggingFace tokenizer config specifies. - - Args: - model: Model path string - hf_config: HuggingFace config to patch in-place - - Returns: - Updated HuggingFace config - """ - # Patch multimodal config if mmproj.gguf exists - mmproj_path = detect_gguf_multimodal(model) - if mmproj_path is not None: - vision_config = extract_vision_config_from_gguf(str(mmproj_path)) - - # Create HF config for Gemma3 multimodal - text_config = hf_config.get_text_config() - is_gemma3 = hf_config.model_type in ("gemma3", "gemma3_text") - if vision_config is not None and is_gemma3: - new_hf_config = Gemma3Config( - text_config=text_config, - vision_config=vision_config, - architectures=["Gemma3ForConditionalGeneration"], - ) - hf_config = new_hf_config - - return hf_config - - -def get_gguf_file_path_from_hf( - repo_id: str | Path, - quant_type: str, - revision: str | None = None, -) -> str: - """Get the GGUF file path from HuggingFace Hub based on repo_id and quant_type. 
- - Args: - repo_id: The HuggingFace repository ID (e.g., "Qwen/Qwen3-0.6B") - quant_type: The quantization type (e.g., "Q4_K_M", "F16") - revision: Optional revision/branch name - - Returns: - The path to the GGUF file on HuggingFace Hub (e.g., "filename.gguf"), - """ - repo_id = str(repo_id) - gguf_patterns = [ - f"*-{quant_type}.gguf", - f"*-{quant_type}-*.gguf", - f"*/*-{quant_type}.gguf", - f"*/*-{quant_type}-*.gguf", - ] - matching_files = list_filtered_repo_files( - repo_id, - allow_patterns=gguf_patterns, - revision=revision, - ) - - if len(matching_files) == 0: - raise ValueError( - "Could not find GGUF file for repo %s with quantization %s.", - repo_id, - quant_type, - ) - - # Sort to ensure consistent ordering (prefer non-sharded files) - matching_files.sort(key=lambda x: (x.count("-"), x)) - gguf_filename = matching_files[0] - return gguf_filename diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index 0e241f6abfd1..ed9d764cdc43 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -24,8 +24,8 @@ from typing_extensions import TypeVar from vllm.logger import init_logger +from vllm.model_format import get_model_format_handler from vllm.transformers_utils import processors -from vllm.transformers_utils.gguf_utils import is_gguf from vllm.transformers_utils.repo_utils import get_hf_file_to_dict from vllm.transformers_utils.utils import convert_model_repo_to_path from vllm.utils.func_utils import get_allowed_kwarg_only_overrides @@ -341,13 +341,8 @@ def cached_processor_from_config( processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin, **kwargs: Any, ) -> _P: - if is_gguf(model_config.model): - assert not is_gguf(model_config.tokenizer), ( - "For multimodal GGUF models, the original tokenizer " - "should be used to correctly load processor." 
- ) - model = model_config.tokenizer - revision = model_config.tokenizer_revision + if handler := get_model_format_handler(model_config.model): + model, revision = handler.resolve_processor_source(model_config, "processor") else: model = model_config.model revision = model_config.revision @@ -455,13 +450,10 @@ def cached_image_processor_from_config( model_config: "ModelConfig", **kwargs: Any, ): - if is_gguf(model_config.model): - assert not is_gguf(model_config.tokenizer), ( - "For multimodal GGUF models, the original tokenizer " - "should be used to correctly load image processor." + if handler := get_model_format_handler(model_config.model): + model, revision = handler.resolve_processor_source( + model_config, "image_processor" ) - model = model_config.tokenizer - revision = model_config.tokenizer_revision else: model = model_config.model revision = model_config.revision diff --git a/vllm/v1/metrics/perf.py b/vllm/v1/metrics/perf.py index 91629cb57816..d76a54aae377 100644 --- a/vllm/v1/metrics/perf.py +++ b/vllm/v1/metrics/perf.py @@ -66,7 +66,6 @@ class InvalidComponent(Exception): "bitsandbytes": 0.5, "modelopt_fp4": 0.5, "petit_nvfp4": 0.5, - "gguf": 0.5, "compressed-tensors": 0.5, "torchao": 0.5, "quark": 0.5, From 642e1ecc99fc5869ef44b7f21daac8cc2c86d535 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 12 Apr 2026 14:44:11 +0800 Subject: [PATCH 02/21] remove gguf materialization Signed-off-by: Isotr0py --- vllm/model_executor/layers/linear.py | 103 +-------------------------- 1 file changed, 1 insertion(+), 102 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 297a10bd3942..e6da811b3885 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -5,7 +5,7 @@ from abc import abstractmethod import torch -from torch.nn.parameter import Parameter, UninitializedParameter +from torch.nn.parameter import Parameter import vllm.envs as envs from vllm.distributed import ( 
@@ -358,20 +358,6 @@ def __init__( self.register_parameter("bias", None) def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): - # If the weight on disk does not have a shape, give it one - # (such scales for AutoFp8). - needs_custom_weight_materialization = getattr( - param, "needs_custom_weight_materialization", False - ) - needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False) - if needs_custom_weight_type: - param.weight_type = loaded_weight.item() - - if needs_custom_weight_materialization and isinstance( - param, UninitializedParameter - ): - param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype) - if len(loaded_weight.shape) == 0: loaded_weight = loaded_weight.reshape(1) @@ -535,22 +521,6 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # no need to narrow is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit - needs_custom_weight_materialization = getattr( - param, "needs_custom_weight_materialization", False - ) - needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False) - if needs_custom_weight_type: - param.weight_type = loaded_weight.item() - - if needs_custom_weight_materialization and isinstance( - param, UninitializedParameter - ): - final_shape = list(loaded_weight.shape) - if output_dim is not None: - assert final_shape[output_dim] % self.tp_size == 0 - final_shape[output_dim] = final_shape[output_dim] // self.tp_size - param.materialize(final_shape, dtype=loaded_weight.dtype) - param_data = param.data if output_dim is not None and not is_sharded_weight: shard_size = param_data.shape[output_dim] @@ -695,38 +665,6 @@ def weight_loader( loaded_shard_id: tuple[int, ...] 
| int | None = None, ): self.validate_shard_id(loaded_shard_id) - needs_custom_weight_materialization = getattr( - param, "needs_custom_weight_materialization", False - ) - needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False) - if isinstance(loaded_shard_id, tuple) and ( - needs_custom_weight_materialization or needs_custom_weight_type - ): - raise NotImplementedError( - "Shard id with multiple indices is not supported for this " - "format-specific weight loader." - ) - if needs_custom_weight_type: - if loaded_shard_id is not None: - param.data[loaded_shard_id].copy_(loaded_weight) - param.shard_weight_type[loaded_shard_id] = loaded_weight.item() - else: - param.shard_weight_type = { - i: loaded_weight.item() for i, _ in enumerate(self.output_sizes) - } - return - - if needs_custom_weight_materialization: - output_dim = getattr(param, "output_dim", None) - shard_size = loaded_weight.size(output_dim) // self.tp_size - start_idx = self.tp_rank * shard_size - - if loaded_shard_id is not None: - loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - param.shard_id.append(loaded_shard_id) - param.shard_id_map[loaded_shard_id] = len(param.data_container) - param.data_container.append(loaded_weight) - return param_data = param.data output_dim = getattr(param, "output_dim", None) @@ -1172,30 +1110,6 @@ def weight_loader( loaded_shard_id: str | None = None, ): self.validate_shard_id(loaded_shard_id) - needs_custom_weight_materialization = getattr( - param, "needs_custom_weight_materialization", False - ) - needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False) - if needs_custom_weight_type: - idx_map = {"q": 0, "k": 1, "v": 2} - if loaded_shard_id is not None: - param.data[idx_map[loaded_shard_id]].copy_(loaded_weight) - param.shard_weight_type[loaded_shard_id] = loaded_weight.item() - else: - param.shard_weight_type = {k: loaded_weight.item() for k in idx_map} - return - - if 
needs_custom_weight_materialization: - output_dim = getattr(param, "output_dim", None) - shard_size = loaded_weight.size(output_dim) // self.tp_size - start_idx = self.tp_rank * shard_size - - if loaded_shard_id is not None: - loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - param.shard_id.append(loaded_shard_id) - param.shard_id_map[loaded_shard_id] = len(param.data_container) - param.data_container.append(loaded_weight) - return param_data = param.data output_dim = getattr(param, "output_dim", None) @@ -1484,21 +1398,6 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # no need to narrow is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit - needs_custom_weight_materialization = getattr( - param, "needs_custom_weight_materialization", False - ) - needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False) - if needs_custom_weight_type: - param.weight_type = loaded_weight.item() - - if needs_custom_weight_materialization and isinstance( - param, UninitializedParameter - ): - weight_shape = list(loaded_weight.shape) - if input_dim: - weight_shape[input_dim] = weight_shape[input_dim] // self.tp_size - param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype) - param_data = param.data if input_dim is not None and not is_sharded_weight: shard_size = param_data.shape[input_dim] From 1b53ba7b2827ec51c514bea6e61c065a8b98fc21 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 12 Apr 2026 14:44:32 +0800 Subject: [PATCH 03/21] clean Signed-off-by: Isotr0py --- vllm/tokenizers/registry.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py index c4ee1ad9a310..7a963d570d9b 100644 --- a/vllm/tokenizers/registry.py +++ b/vllm/tokenizers/registry.py @@ -10,7 +10,6 @@ import vllm.envs as envs from vllm.logger import init_logger -from vllm.model_format import get_model_format_handler from vllm.transformers_utils.repo_utils import 
( any_pattern_in_repo_files, is_mistral_model_repo, @@ -109,16 +108,6 @@ def resolve_tokenizer_args( ) tokenizer_name = tokenizer_path - if handler := get_model_format_handler(tokenizer_name): - tokenizer_name, args, kwargs = handler.resolve_tokenizer_init( - tokenizer_name, - *args, - revision=revision, - runner_type=runner_type, - tokenizer_mode=tokenizer_mode, - **kwargs, - ) - if "truncation_side" not in kwargs: if runner_type == "generate" or runner_type == "draft": kwargs["truncation_side"] = "left" From 6332347efc85eccbc50834a26b37d2f76c72f04b Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 12 Apr 2026 14:59:35 +0800 Subject: [PATCH 04/21] remove gguf materialization Signed-off-by: Isotr0py --- .../layers/vocab_parallel_embedding.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 24c02a49448f..955f2ae1d30a 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -6,7 +6,7 @@ import torch import torch.nn.functional as F -from torch.nn.parameter import Parameter, UninitializedParameter +from torch.nn.parameter import Parameter from vllm.distributed import ( divide, @@ -418,16 +418,6 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): output_dim = getattr(param, "output_dim", None) packed_dim = getattr(param, "packed_dim", None) - if getattr(param, "needs_custom_weight_type", None): - param.data.copy_(loaded_weight) - param.weight_type = loaded_weight.item() - return - elif isinstance(param, UninitializedParameter): - shape = list(loaded_weight.shape) - if output_dim is not None: - shape[output_dim] = self.num_embeddings_per_partition - param.materialize(tuple(shape), dtype=loaded_weight.dtype) - # If parameter does not have output dim, then it should # be copied onto all gpus (e.g. 
g_idx for act_order gptq). if output_dim is None: @@ -548,11 +538,8 @@ def __init__( def tie_weights(self, embed_tokens: VocabParallelEmbedding): """Tie the weights with word embeddings.""" - if self.quant_config and self.quant_config.should_keep_tied_lm_head(): - return embed_tokens - else: - self.weight = embed_tokens.weight - return self + self.weight = embed_tokens.weight + return self def forward(self, input_): del input_ From a270d85f7e1557f3ccb04780e6728154a3e15909 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Mon, 13 Apr 2026 01:14:27 +0800 Subject: [PATCH 05/21] clean Signed-off-by: Isotr0py --- vllm/config/model.py | 13 +-- vllm/engine/arg_utils.py | 4 - vllm/model_format.py | 162 --------------------------- vllm/transformers_utils/config.py | 67 ++--------- vllm/transformers_utils/processor.py | 22 +--- 5 files changed, 16 insertions(+), 252 deletions(-) delete mode 100644 vllm/model_format.py diff --git a/vllm/config/model.py b/vllm/config/model.py index 054f14a26fef..1a47f4e7266b 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -25,7 +25,6 @@ from vllm.config.scheduler import RunnerType from vllm.config.utils import config, getattr_iter from vllm.logger import init_logger -from vllm.model_format import get_model_format_handler from vllm.platforms import current_platform from vllm.tasks import PoolingTask, ScoreType, SupportedTask from vllm.transformers_utils.config import ( @@ -498,9 +497,6 @@ def __post_init__( hf_overrides_fn=hf_overrides_fn, token=self.hf_token, ) - if handler := get_model_format_handler(self.model): - hf_config = handler.patch_model_hf_config(self.model, hf_config) - self.hf_config = hf_config if dict_overrides: self._apply_dict_overrides(hf_config, dict_overrides) @@ -658,10 +654,6 @@ def __post_init__( "disable the cache with --mm-processor-cache-gb 0." 
) - # Multimodal GGUF models must use original repo for mm processing - if handler := get_model_format_handler(self.model): - handler.validate_model_config(self) - if self.disable_sliding_window: # Set after get_and_verify_max_len to ensure that max_model_len # can be correctly capped to sliding window size @@ -814,10 +806,7 @@ def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> No self.tokenizer = object_storage_tokenizer.dir def _get_encoder_config(self) -> dict[str, Any] | None: - model = self.model - if handler := get_model_format_handler(model): - model = handler.resolve_sentence_transformer_source(model, self.revision) - return get_sentence_transformer_tokenizer_config(model, self.revision) + return get_sentence_transformer_tokenizer_config(self.model, self.revision) def _get_default_runner_type( self, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b42904367883..69500ec051cf 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -93,7 +93,6 @@ from vllm.config.utils import get_field from vllm.config.vllm import OptimizationLevel, PerformanceMode from vllm.logger import init_logger, suppress_logging -from vllm.model_format import get_model_format_handler from vllm.platforms import CpuArchEnum, current_platform from vllm.plugins import load_general_plugins from vllm.ray.lazy_utils import is_in_ray_actor, is_ray_initialized @@ -1418,9 +1417,6 @@ def from_cli_args(cls, args: argparse.Namespace): def create_model_config(self) -> ModelConfig: load_general_plugins() - if handler := get_model_format_handler(self.model): - handler.update_engine_args(self) - if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: logger.warning( "The global random seed is set to %d. 
Since " diff --git a/vllm/model_format.py b/vllm/model_format.py deleted file mode 100644 index cb77983a9470..000000000000 --- a/vllm/model_format.py +++ /dev/null @@ -1,162 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from pathlib import Path -from typing import TYPE_CHECKING, Any - -from vllm.logger import init_logger - -if TYPE_CHECKING: - from transformers import PretrainedConfig - -logger = init_logger(__name__) - - -class ModelFormatHandler: - """Extension hook for out-of-tree model formats. - - Handlers can customize how a model reference is interpreted across vLLM, - such as model/config discovery, tokenizer and processor resolution, and - engine-arg defaults. - """ - - name: str = "" - - def matches(self, model: str | Path | None) -> bool: - return False - - def update_engine_args(self, engine_args: Any) -> None: - return - - def prepare_hf_config_load( - self, - model: str | Path, - revision: str | None = None, - kwargs: dict[str, Any] | None = None, - ) -> tuple[str | Path, dict[str, Any]]: - return model, kwargs or {} - - def should_use_hf_config_parser( - self, - original_model: str | Path, - resolved_model: str | Path, - ) -> bool: - return False - - def get_missing_hf_config_error( - self, - original_model: str | Path, - resolved_model: str | Path, - ) -> str | None: - return None - - def patch_parsed_hf_config( - self, - original_model: str | Path, - config_dict: dict[str, Any], - config: "PretrainedConfig", - ) -> "PretrainedConfig": - return config - - def patch_model_hf_config( - self, - original_model: str | Path, - hf_config: "PretrainedConfig", - ) -> "PretrainedConfig": - return hf_config - - def resolve_tokenizer_init( - self, - tokenizer_name: str | Path, - *args: Any, - revision: str | None = None, - runner_type: str = "generate", - tokenizer_mode: str = "auto", - **kwargs: Any, - ) -> tuple[str | Path, tuple[Any, ...], dict[str, Any]]: - return tokenizer_name, 
args, kwargs - - def resolve_processor_source( - self, - model_config: Any, - component: str, - ) -> tuple[str | Path, str | None]: - return model_config.model, model_config.revision - - def validate_model_config(self, model_config: Any) -> None: - return - - def resolve_sentence_transformer_source( - self, - model: str | Path, - revision: str | None = None, - ) -> str | Path: - return model - - def resolve_image_processor_source( - self, - model: str | Path, - revision: str | None = None, - ) -> str | Path: - return model - - def should_skip_generation_config(self, model: str | Path) -> bool: - return False - - -_MODEL_FORMAT_HANDLERS: list[ModelFormatHandler] = [] - - -def register_model_format(handler: ModelFormatHandler) -> ModelFormatHandler: - if not isinstance(handler, ModelFormatHandler): - raise ValueError("The model format handler must subclass `ModelFormatHandler`.") - - replaced = False - if handler.name: - for idx, existing in enumerate(_MODEL_FORMAT_HANDLERS): - if existing.name == handler.name: - logger.warning( - "The model format handler %r already exists and will be " - "overwritten by %s.", - handler.name, - type(handler), - ) - _MODEL_FORMAT_HANDLERS[idx] = handler - replaced = True - break - - if not replaced: - _MODEL_FORMAT_HANDLERS.append(handler) - - return handler - - -def get_model_format_handler(model: str | Path | None) -> ModelFormatHandler | None: - for handler in reversed(_MODEL_FORMAT_HANDLERS): - if handler.matches(model): - return handler - return None - - -def prepare_hf_model_reference( - model: str | Path, - revision: str | None = None, - **kwargs: Any, -) -> tuple[ModelFormatHandler | None, str | Path, dict[str, Any]]: - handler = get_model_format_handler(model) - if handler is None: - return None, model, kwargs - resolved_model, resolved_kwargs = handler.prepare_hf_config_load( - model, - revision=revision, - kwargs=kwargs, - ) - return handler, resolved_model, resolved_kwargs - - -__all__ = [ - "ModelFormatHandler", - 
"get_model_format_handler", - "prepare_hf_model_reference", - "register_model_format", -] diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index fa8beacfebcf..3a6fb93fa1f9 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -25,10 +25,6 @@ from vllm import envs from vllm.logger import init_logger -from vllm.model_format import ( - get_model_format_handler, - prepare_hf_model_reference, -) from vllm.transformers_utils.repo_utils import is_mistral_model_repo from vllm.transformers_utils.utils import ( parse_safetensors_file_metadata, @@ -571,17 +567,15 @@ def maybe_override_with_speculators( Tuple of (resolved_model, resolved_tokenizer, speculative_config) """ kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE - _, resolved_model, resolved_kwargs = prepare_hf_model_reference( - model, - revision=revision, - **kwargs, - ) - config_dict, _ = PretrainedConfig.get_config_dict( - resolved_model, - revision=revision, - token=hf_token, - **without_trust_remote_code(resolved_kwargs), - ) + try: + config_dict, _ = PretrainedConfig.get_config_dict( + model, + revision=revision, + token=hf_token, + **without_trust_remote_code(kwargs), + ) + except OSError: + config_dict = {} speculators_config = config_dict.get("speculators_config") if speculators_config is None: @@ -615,20 +609,8 @@ def get_config( hf_overrides_fn: Callable[[PretrainedConfig], PretrainedConfig] | None = None, **kwargs, ) -> PretrainedConfig: - original_model = model - handler, model, kwargs = prepare_hf_model_reference( - model, - revision=revision, - **kwargs, - ) - if config_format == "auto": try: - use_hf_parser_without_config = ( - handler.should_use_hf_config_parser(original_model, model) - if handler is not None - else False - ) # First check for Mistral to avoid defaulting to # Transformers implementation. 
if is_mistral_model_repo( @@ -637,22 +619,9 @@ def get_config( model=model, config_name=MISTRAL_CONFIG_NAME, revision=revision ): config_format = "mistral" - elif use_hf_parser_without_config or file_or_path_exists( - model, HF_CONFIG_NAME, revision=revision - ): + elif file_or_path_exists(model, HF_CONFIG_NAME, revision=revision): config_format = "hf" else: - if ( - handler is not None - and ( - err_msg := handler.get_missing_hf_config_error( - original_model, model - ) - ) - is not None - ): - logger.error(err_msg) - raise ValueError(err_msg) raise ValueError( "Could not detect config format for no config file found. " "With config_format 'auto', ensure your model has either " @@ -672,7 +641,7 @@ def get_config( "'config.json'.\n" " - For Mistral models: ensure the presence of a " "'params.json'.\n" - ).format(model=original_model) + ).format(model=model) raise ValueError(error_message) from e @@ -685,8 +654,6 @@ def get_config( hf_overrides=hf_overrides_kw or hf_overrides_fn, **kwargs, ) - if handler is not None: - config = handler.patch_parsed_hf_config(original_model, config_dict, config) # Architecture mapping for models without explicit architectures field if not config.architectures: @@ -779,9 +746,6 @@ def get_pooling_config( A dictionary containing the pooling type and whether normalization is used, or None if no pooling configuration is found. """ - if handler := get_model_format_handler(model): - model = handler.resolve_sentence_transformer_source(model, revision) - modules_file_name = "modules.json" modules_dict = None @@ -869,9 +833,6 @@ def get_sentence_transformer_tokenizer_config( - dict: A dictionary containing the configuration parameters for the Sentence Transformer BERT model. 
""" - if handler := get_model_format_handler(model): - model = handler.resolve_sentence_transformer_source(model, revision) - sentence_transformer_config_files = [ "sentence_bert_config.json", "sentence_roberta_config.json", @@ -1000,8 +961,6 @@ def get_hf_image_processor_config( # ModelScope does not provide an interface for image_processor if envs.VLLM_USE_MODELSCOPE: return dict() - if handler := get_model_format_handler(model): - model = handler.resolve_image_processor_source(model, revision) return get_image_processor_config( model, token=hf_token, revision=revision, **kwargs ) @@ -1031,10 +990,6 @@ def try_get_generation_config( config_format: str | ConfigFormat = "auto", hf_token: bool | str | None = None, ) -> GenerationConfig | None: - if handler := get_model_format_handler(model): - if handler.should_skip_generation_config(model): - return None - try: return GenerationConfig.from_pretrained( model, diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index ed9d764cdc43..bf432fe8f1e0 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -24,7 +24,6 @@ from typing_extensions import TypeVar from vllm.logger import init_logger -from vllm.model_format import get_model_format_handler from vllm.transformers_utils import processors from vllm.transformers_utils.repo_utils import get_hf_file_to_dict from vllm.transformers_utils.utils import convert_model_repo_to_path @@ -341,15 +340,9 @@ def cached_processor_from_config( processor_cls: type[_P] | tuple[type[_P], ...] 
= ProcessorMixin, **kwargs: Any, ) -> _P: - if handler := get_model_format_handler(model_config.model): - model, revision = handler.resolve_processor_source(model_config, "processor") - else: - model = model_config.model - revision = model_config.revision - return cached_get_processor_without_dynamic_kwargs( - model, - revision=revision, + model_config.model, + revision=model_config.revision, trust_remote_code=model_config.trust_remote_code, processor_cls=processor_cls, # type: ignore[arg-type] **_merge_mm_kwargs(model_config, processor_cls, **kwargs), @@ -450,16 +443,9 @@ def cached_image_processor_from_config( model_config: "ModelConfig", **kwargs: Any, ): - if handler := get_model_format_handler(model_config.model): - model, revision = handler.resolve_processor_source( - model_config, "image_processor" - ) - else: - model = model_config.model - revision = model_config.revision return cached_get_image_processor( - model, - revision=revision, + model_config.model, + revision=model_config.revision, trust_remote_code=model_config.trust_remote_code, **_merge_mm_kwargs(model_config, AutoImageProcessor, **kwargs), ) From 4d940cb9791c556b3385ce1ac4b85a568d5ea074 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 7 May 2026 22:25:37 +0800 Subject: [PATCH 06/21] workaround tie words embedding Signed-off-by: Isotr0py --- vllm/model_executor/layers/vocab_parallel_embedding.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index ae8243d314bd..3321010ac6a9 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -77,6 +77,12 @@ def apply( def embedding(self, layer: torch.nn.Module, input_: torch.Tensor) -> torch.Tensor: return F.embedding(input_, layer.weight) + def tie_weights( + self, layer: torch.nn.Module, embed_tokens: "VocabParallelEmbedding" + ): + 
layer.weight = embed_tokens.weight + return layer + def pad_vocab_size(vocab_size: int, pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int: """Pad the vocab size to the given value.""" @@ -544,8 +550,7 @@ def __init__( def tie_weights(self, embed_tokens: VocabParallelEmbedding): """Tie the weights with word embeddings.""" - self.weight = embed_tokens.weight - return self + return self.quant_method.tie_weights(self, embed_tokens) def forward(self, input_): del input_ From 83fe14d0ec812a2b2851dfa7f51008f9bb4fb476 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 8 May 2026 23:25:19 +0800 Subject: [PATCH 07/21] remove siglip maybe_swap_ffn_param Signed-off-by: Isotr0py --- vllm/model_executor/models/siglip.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index c50d737878f5..bc6302068a73 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -952,27 +952,12 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: break else: param = params_dict[name] - param = maybe_swap_ffn_param( - name, param, loaded_weight, params_dict, self.quant_config - ) weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params -def maybe_swap_ffn_param( - name: str, - param: torch.Tensor, - loaded_weight: torch.Tensor, - params_dict: dict[str, torch.Tensor], - quant_config: QuantizationConfig, -) -> torch.Tensor: - if quant_config is None or ".fc" not in name: - return param - return quant_config.remap_loaded_parameter(name, param, loaded_weight, params_dict) - - # Adapted from: https://github.com/huggingface/transformers/blob/v4.54.1/src/transformers/models/siglip/modeling_siglip.py#L200 class SiglipTextEmbeddings(nn.Module): def __init__(self, config: SiglipTextConfig): From df565fa298414fb737e6f9024634ee1293d591f1 Mon Sep 17 00:00:00 2001 
From: Isotr0py Date: Sun, 10 May 2026 14:57:32 +0800 Subject: [PATCH 08/21] pass quant_config Signed-off-by: Isotr0py --- vllm/model_executor/models/olmoe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 1f342ad1733d..5b661aa4e4de 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -279,12 +279,14 @@ def __init__( super().__init__() config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.vocab_size = config.vocab_size self.config = config self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, + quant_config=quant_config, ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, From ca76814652a4f0c50d0956401c9c0312acd658ab Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 10 May 2026 15:23:44 +0800 Subject: [PATCH 09/21] clean Signed-off-by: Isotr0py --- vllm/model_executor/models/apertus.py | 4 ---- vllm/model_executor/models/exaone.py | 4 ---- vllm/model_executor/models/exaone4.py | 4 ---- vllm/model_executor/models/jais2.py | 5 ----- vllm/model_executor/models/llama.py | 4 ---- vllm/model_executor/models/llama4.py | 4 ---- vllm/model_executor/models/openpangu.py | 5 ----- 7 files changed, 30 deletions(-) diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index 7254f41fd2f0..dcea424200d0 100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -228,10 +228,6 @@ def _init_rotary_emb( quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True - if quant_config is not None: - override = quant_config.override_is_neox_style(config.model_type) - if override is not None: - is_neox_style = override self.rotary_emb = get_rope( self.head_dim, diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 
4a56aa676073..f80a5b34c14f 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -162,10 +162,6 @@ def __init__( ) is_neox_style = True - if quant_config is not None: - override = quant_config.override_is_neox_style(config.model_type) - if override is not None: - is_neox_style = override self.rotary_emb = get_rope( self.head_dim, diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index 51b317a51229..81bd79ca8a7e 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -168,10 +168,6 @@ def __init__( self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) is_neox_style = True - if quant_config is not None: - override = quant_config.override_is_neox_style(config.model_type) - if override is not None: - is_neox_style = override layer_idx = extract_layer_index(prefix) is_sliding = config.layer_types[layer_idx] == "sliding_attention" diff --git a/vllm/model_executor/models/jais2.py b/vllm/model_executor/models/jais2.py index 0f3ec5315847..f4303c4010e4 100644 --- a/vllm/model_executor/models/jais2.py +++ b/vllm/model_executor/models/jais2.py @@ -161,11 +161,6 @@ def __init__( ) is_neox_style = True - if quant_config is not None: - override = quant_config.override_is_neox_style(config.model_type) - if override is not None: - is_neox_style = override - self.rotary_emb = get_rope( self.head_dim, max_position=max_position_embeddings, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 93607c967af4..ef704e8c0d95 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -238,10 +238,6 @@ def _init_rotary_emb( quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True - if quant_config is not None: - override = quant_config.override_is_neox_style(config.model_type) - if override is not None: - is_neox_style = override self.rotary_emb = get_rope( self.head_dim, 
diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 349ede83f1a3..fc4e95be0406 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -237,10 +237,6 @@ def __init__( prefix=f"{prefix}.o_proj", ) is_neox_style = True - if quant_config is not None: - override = quant_config.override_is_neox_style(config.model_type) - if override is not None: - is_neox_style = override self.rotary_emb = ( get_rope( diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index 29688b5af228..783655a08d97 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -517,11 +517,6 @@ def _init_rotary_emb( quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True - if quant_config is not None: - override = quant_config.override_is_neox_style(config.model_type) - if override is not None: - is_neox_style = override - rope_parameters = config.rope_parameters or {} if rope_parameters is not None and rope_parameters.get( "mrope_interleaved", False From 4b1e2d0d8a17081cee5e07ecd8c01a5db3d17481 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 10 May 2026 15:29:14 +0800 Subject: [PATCH 10/21] clean openpangu Signed-off-by: Isotr0py --- vllm/model_executor/models/openpangu.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index 783655a08d97..a517c52e6902 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -712,22 +712,6 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): # no need to narrow is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit - needs_custom_weight_materialization = getattr( - param, "needs_custom_weight_materialization", False - ) - needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False) - if 
needs_custom_weight_type: - param.weight_type = loaded_weight.item() - - if needs_custom_weight_materialization and isinstance( - param, nn.UninitializedParameter - ): - final_shape = list(loaded_weight.shape) - if output_dim is not None: - assert final_shape[output_dim] % self.tp_size == 0 - final_shape[output_dim] = final_shape[output_dim] // self.tp_size - param.materialize(final_shape, dtype=loaded_weight.dtype) - param_data = param.data if output_dim is not None and not is_sharded_weight: shard_size = param_data.shape[output_dim] From 070ce481184b6bb12348f38ee6919e6e433b35cc Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 10 May 2026 15:31:55 +0800 Subject: [PATCH 11/21] clean dead code Signed-off-by: Isotr0py --- .../layers/quantization/base_config.py | 22 ------------------- vllm/model_executor/models/gemma3.py | 5 ----- 2 files changed, 27 deletions(-) diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index b870bbd94b39..8f70f3e7a0b2 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -169,28 +169,6 @@ def get_quant_method( def get_cache_scale(self, name: str) -> str | None: return None - def override_is_neox_style(self, model_type: str) -> bool | None: - return None - - def should_keep_tied_lm_head(self) -> bool: - return False - - def transform_loaded_weight( - self, - name: str, - loaded_weight: torch.Tensor, - ) -> torch.Tensor: - return loaded_weight - - def remap_loaded_parameter( - self, - name: str, - param: torch.Tensor, - loaded_weight: torch.Tensor, - params_dict: dict[str, torch.Tensor], - ) -> torch.Tensor: - return param - def apply_vllm_mapper( # noqa: B027 self, hf_to_vllm_mapper: "WeightsMapper" ): diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 5b3f0688de4d..72392b8f9ece 100644 --- a/vllm/model_executor/models/gemma3.py +++ 
b/vllm/model_executor/models/gemma3.py @@ -383,11 +383,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() for name, loaded_weight in weights: - if self.quant_config is not None: - loaded_weight = self.quant_config.transform_loaded_weight( - name, loaded_weight - ) - if self.quant_config is not None and ( scale_name := self.quant_config.get_cache_scale(name) ): From de99b2bab52eda9d93865be99445a22b422fd39c Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 10 May 2026 17:02:38 +0800 Subject: [PATCH 12/21] clean MoE weight loader Signed-off-by: Isotr0py --- vllm/model_executor/layers/fused_moe/layer.py | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index fd54ef8715c5..bbfe5af64277 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -6,7 +6,6 @@ from typing import Literal, cast, get_args, overload import torch -from torch.nn.parameter import UninitializedParameter from vllm._aiter_ops import rocm_aiter_ops from vllm.config import VllmConfig, get_current_vllm_config @@ -1155,15 +1154,6 @@ def weight_loader( # dimension intermediate_size_per_partition is used. 
SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0} - needs_custom_weight_materialization = getattr( - param, "needs_custom_weight_materialization", False - ) - needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False) - if needs_custom_weight_type: - param.weight_type = loaded_weight.item() - param.data.copy_(loaded_weight) - return True if return_success else None - # Case for BitsAndBytes use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) if use_bitsandbytes_4bit: @@ -1209,19 +1199,6 @@ def weight_loader( if full_load: shard_dim += 1 - if needs_custom_weight_materialization and isinstance( - param, UninitializedParameter - ): - # To materialize a tensor, we must have full shape including - # number of experts, making this portion to require `full_load`. - assert full_load - final_shape = list(loaded_weight.shape) - # w1 and w3 are merged per expert. - if shard_id in {"w1", "w3"}: - final_shape[1] *= 2 - final_shape[shard_dim] = final_shape[shard_dim] // self.tp_size - param.materialize(final_shape, dtype=loaded_weight.dtype) - expert_data = param.data if full_load else param.data[expert_id] # Case input scale: input_scale loading is only supported for fp8 From 1c74b1c6333b2e04d745a854ffb9932f84f8ba69 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 10 May 2026 17:12:37 +0800 Subject: [PATCH 13/21] clean rocm test Signed-off-by: Isotr0py --- requirements/test/rocm.txt | 8 -------- 1 file changed, 8 deletions(-) diff --git a/requirements/test/rocm.txt b/requirements/test/rocm.txt index b61d74246452..9d95a9e4375b 100644 --- a/requirements/test/rocm.txt +++ b/requirements/test/rocm.txt @@ -273,12 +273,6 @@ genai-perf==0.0.16 # via -r requirements/test/rocm.in genson==1.3.0 # via datamodel-code-generator -geopandas==1.1.3 - # via terratorch -gitdb==4.0.12 - # via gitpython -gitpython==3.1.46 - # via wandb google-api-core==2.30.0 # via # google-cloud-core @@ -585,8 +579,6 @@ numpy==2.2.6 # evaluate # fastparquet # genai-perf 
- # geopandas - # h5py # imagehash # imageio # librosa From f38f9bb8a5f26cb13e88bbc104d2f6a5ffed2b8d Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 10 May 2026 23:46:29 +0800 Subject: [PATCH 14/21] clean spec config overrides Signed-off-by: Isotr0py --- vllm/transformers_utils/config.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 955360dd472c..ecf0638ef2cb 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -602,15 +602,12 @@ def maybe_override_with_speculators( Tuple of (resolved_model, resolved_tokenizer, speculative_config) """ kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE - try: - config_dict, _ = PretrainedConfig.get_config_dict( - model, - revision=revision, - token=hf_token, - **without_trust_remote_code(kwargs), - ) - except OSError: - config_dict = {} + config_dict, _ = PretrainedConfig.get_config_dict( + model, + revision=revision, + token=hf_token, + **without_trust_remote_code(kwargs), + ) speculators_config = config_dict.get("speculators_config") if speculators_config is None: From fdf0b53b5aeb19461e7694c4f11d6b7cbf1ae63b Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Mon, 11 May 2026 00:08:45 +0800 Subject: [PATCH 15/21] clean unused config patch Signed-off-by: Isotr0py --- vllm/model_executor/layers/quantization/base_config.py | 4 ---- vllm/model_executor/model_loader/weight_utils.py | 3 --- 2 files changed, 7 deletions(-) diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 8f70f3e7a0b2..344ddd8abd25 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -129,10 +129,6 @@ def override_quantization_method( """ return None - @classmethod - def requires_hf_quant_config(cls) -> bool: - return True - @staticmethod def 
get_from_keys(config: dict[str, Any], keys: list[str]) -> Any: """Get a value from the model's quantization config.""" diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index a19ead0a6b69..e0290a2d4496 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -258,9 +258,6 @@ def get_quant_config( raise ValueError("Model quantization method is not specified in the config.") quant_cls = get_quantization_config(model_config.quantization) - if not quant_cls.requires_hf_quant_config(): - return quant_cls.from_config({}) - # Read the quantization config from the HF model config, if available. hf_quant_config = getattr(model_config.hf_config, "quantization_config", None) # some vision model may keep quantization_config in their text_config From 78f3b92e1ab90ad63fd4c66ef5a2ec0e841ba319 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Mon, 11 May 2026 00:22:09 +0800 Subject: [PATCH 16/21] clean unnecessary load_general_plugins Signed-off-by: Isotr0py --- vllm/engine/arg_utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index fc7df1d5ef11..79a0dbdd3707 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1484,8 +1484,6 @@ def from_cli_args(cls, args: argparse.Namespace): return engine_args def create_model_config(self) -> ModelConfig: - load_general_plugins() - if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: logger.warning( "The global random seed is set to %d. Since " @@ -1629,7 +1627,6 @@ def create_engine_config( NOTE: If VllmConfig is incompatible, we raise an error. 
""" current_platform.pre_register_and_update() - load_general_plugins() device_config = DeviceConfig(device=cast(Device, current_platform.device_type)) From a94499b68f8535b6f72533927bcc64720a8d6e1d Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 14 May 2026 02:18:46 +0800 Subject: [PATCH 17/21] make pre-commit happy Signed-off-by: Isotr0py --- vllm/model_executor/layers/quantization/base_config.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 344ddd8abd25..ed32699d2fd9 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -47,6 +47,13 @@ def embedding(self, layer: torch.nn.Module, *args, **kwargs) -> torch.Tensor: Expects create_weights to have been called before on the layer.""" raise NotImplementedError + # Not required functions + def tie_weights(self, layer: torch.nn.Module, *args, **kwargs): + """Tie layer's weights for the layer from another layer/tensors. + + Expects create_weights to have been called before on the layer.""" + raise NotImplementedError + def process_weights_after_loading(self, layer: nn.Module) -> None: """Process the weight after loading. From c321339b11ade7c242ee20ced20e3c509a484e82 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 20 May 2026 15:58:50 +0800 Subject: [PATCH 18/21] add GGUF doc back Signed-off-by: Isotr0py --- docs/features/quantization/gguf.md | 93 ++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 docs/features/quantization/gguf.md diff --git a/docs/features/quantization/gguf.md b/docs/features/quantization/gguf.md new file mode 100644 index 000000000000..0aa76d679e15 --- /dev/null +++ b/docs/features/quantization/gguf.md @@ -0,0 +1,93 @@ +# GGUF + +!!! 
warning + Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, and it might be incompatible with other features. Currently, you can use GGUF as a way to reduce the memory footprint. If you encounter any issues, please report them to the vLLM team. + +!!! note + GGUF support has migrated to the out-of-tree (OOT) [vllm-gguf-plugin](https://github.com/vllm-project/vllm-gguf-plugin). Make sure you have the GGUF plugin installed before serving a GGUF model. + +Before serving a GGUF model, make sure to install the [vllm-gguf-plugin](https://github.com/vllm-project/vllm-gguf-plugin): + +```bash +uv pip install vllm-gguf-plugin +``` + +To run a GGUF model with vLLM, you can use the `repo_id:quant_type` format to load directly from HuggingFace. For example, to load a Q4_K_M quantized model from [unsloth/Qwen3-0.6B-GGUF](https://huggingface.co/unsloth/Qwen3-0.6B-GGUF): + +```bash +# We recommend using the tokenizer from the base model to avoid slow and buggy tokenizer conversion. +vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M --tokenizer Qwen/Qwen3-0.6B +``` + +You can also add `--tensor-parallel-size 2` to enable tensor-parallel inference with 2 GPUs: + +```bash +vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M \ + --tokenizer Qwen/Qwen3-0.6B \ + --tensor-parallel-size 2 +``` + +Alternatively, you can download and use a local GGUF file: + +```bash +wget https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf +vllm serve ./Qwen3-0.6B-Q4_K_M.gguf --tokenizer Qwen/Qwen3-0.6B +``` + +!!! warning + We recommend using the tokenizer from the base model instead of the GGUF model, because the tokenizer conversion from GGUF is time-consuming and unstable, especially for models with a large vocabulary size. + +GGUF assumes that HuggingFace can convert the metadata to a config file.
If HuggingFace doesn't support your model, you can manually create a config and pass it via `--hf-config-path`: + +```bash +# If your model is not supported by HuggingFace, you can manually provide a HuggingFace-compatible config path +vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M \ + --tokenizer Qwen/Qwen3-0.6B \ + --hf-config-path Qwen/Qwen3-0.6B +``` + +You can also use the GGUF model directly through the LLM entrypoint: + +??? code + + ```python + from vllm import LLM, SamplingParams + + # In this script, we demonstrate how to pass input to the chat method: + conversation = [ + { + "role": "system", + "content": "You are a helpful assistant", + }, + { + "role": "user", + "content": "Hello", + }, + { + "role": "assistant", + "content": "Hello! How can I assist you today?", + }, + { + "role": "user", + "content": "Write an essay about the importance of higher education.", + }, + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + # Create an LLM using repo_id:quant_type format. + llm = LLM( + model="unsloth/Qwen3-0.6B-GGUF:Q4_K_M", + tokenizer="Qwen/Qwen3-0.6B", + ) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.chat(conversation, sampling_params) + + # Print the outputs.
+ for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + ``` From 3467f2593d761305234bd40846a9d78279fd7e1e Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 20 May 2026 22:22:57 +0800 Subject: [PATCH 19/21] remove gguf kernels again Signed-off-by: Isotr0py --- .../quantization/gguf/gguf_kernel.cu | 560 ------------- .../libtorch_stable/quantization/gguf/moe.cuh | 739 ------------------ .../quantization/gguf/moe_vec.cuh | 338 -------- 3 files changed, 1637 deletions(-) delete mode 100644 csrc/libtorch_stable/quantization/gguf/gguf_kernel.cu delete mode 100644 csrc/libtorch_stable/quantization/gguf/moe.cuh delete mode 100644 csrc/libtorch_stable/quantization/gguf/moe_vec.cuh diff --git a/csrc/libtorch_stable/quantization/gguf/gguf_kernel.cu b/csrc/libtorch_stable/quantization/gguf/gguf_kernel.cu deleted file mode 100644 index 0fdfcafab8c0..000000000000 --- a/csrc/libtorch_stable/quantization/gguf/gguf_kernel.cu +++ /dev/null @@ -1,560 +0,0 @@ -#include -#include - -#include "../../../cuda_compat.h" -#include "../../dispatch_utils.h" -#include "../../torch_utils.h" - -#include - -// NOTE: These headers are intentionally kept in csrc/quantization/gguf/ (not -// moved to libtorch_stable) to avoid unnecessary reformatting that would break -// git rename detection and pollute blame history. 
-#include "../../../quantization/gguf/ggml-common.h" -#include "../../../quantization/gguf/vecdotq.cuh" -#include "../../../quantization/gguf/dequantize.cuh" -#include "../../../quantization/gguf/mmvq.cuh" -#include "../../../quantization/gguf/mmq.cuh" -#include "moe.cuh" -#include "moe_vec.cuh" - -// Q8 gemv -template -static __global__ void quantize_q8_1(const scalar_t* __restrict__ x, - void* __restrict__ vy, const int kx, - const int kx_padded) { - const auto ix = blockDim.x * blockIdx.x + threadIdx.x; - if (ix >= kx_padded) { - return; - } - const auto iy = blockDim.y * blockIdx.y + threadIdx.y; - const int i_padded = iy * kx_padded + ix; - - block_q8_1* y = (block_q8_1*)vy; - - const int ib = i_padded / QK8_1; // block index - const int iqs = i_padded % QK8_1; // quant index - - const float xi = ix < kx ? static_cast(x[iy * kx + ix]) : 0.0f; - float amax = fabsf(xi); - float sum = xi; - -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - amax = fmaxf(amax, VLLM_SHFL_XOR_SYNC_WIDTH(amax, mask, 32)); - sum += VLLM_SHFL_XOR_SYNC_WIDTH(sum, mask, 32); - } - - const float d = amax / 127; - const int8_t q = amax == 0.0f ? 
0 : roundf(xi / d); - - y[ib].qs[iqs] = q; - - if (iqs > 0) { - return; - } - - y[ib].ds.x = __float2half(d); - y[ib].ds.y = __float2half(sum); -} - -template -static void quantize_row_q8_1_cuda(const scalar_t* x, void* vy, const int kx, - const int ky, cudaStream_t stream) { - const int64_t kx_padded = (kx + 512 - 1) / 512 * 512; - const int block_num_x = - (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; - constexpr int MAX_BLOCK_SIZE = 65535; - for (int off = 0; off < ky; off += MAX_BLOCK_SIZE) { - const int num_blocks_y = std::min(ky, off + MAX_BLOCK_SIZE) - off; - const dim3 num_blocks(block_num_x, num_blocks_y, 1); - const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1); - quantize_q8_1<<>>( - &x[off * kx], (int32_t*)vy + off * (kx_padded / 32 * 9), kx, kx_padded); - } -} - -torch::stable::Tensor ggml_dequantize( - torch::stable::Tensor W, // quant weight - int64_t type, int64_t m, int64_t n, - std::optional const& dtype) { - const torch::stable::accelerator::DeviceGuard device_guard( - W.get_device_index()); - auto dtype_ = dtype.value_or(torch::headeronly::ScalarType::Half); - auto DW = torch::stable::empty({m, n}, dtype_, std::nullopt, W.device()); - cudaStream_t stream = get_current_cuda_stream(); - - VLLM_STABLE_DISPATCH_FLOATING_TYPES(DW.scalar_type(), "ggml_dequantize", [&] { - auto to_cuda = ggml_get_to_cuda(type); - to_cuda((void*)W.data_ptr(), (scalar_t*)DW.data_ptr(), m * n, stream); - }); - - return DW; -} - -torch::stable::Tensor ggml_mul_mat_vec_a8( - torch::stable::Tensor W, // quant weight - torch::stable::Tensor X, // input - int64_t type, int64_t row) { - int col = X.sizes()[1]; - int vecs = X.sizes()[0]; - const int padded = (col + 512 - 1) / 512 * 512; - const torch::stable::accelerator::DeviceGuard device_guard( - X.get_device_index()); - auto Y = torch::stable::empty({vecs, row}, X.scalar_type(), std::nullopt, - W.device()); - cudaStream_t stream = get_current_cuda_stream(); - auto quant_X = 
torch::stable::empty({vecs, padded / 32 * 9}, - torch::headeronly::ScalarType::Int, - std::nullopt, W.device()); - VLLM_STABLE_DISPATCH_FLOATING_TYPES( - X.scalar_type(), "ggml_mul_mat_vec_a8", [&] { - quantize_row_q8_1_cuda((scalar_t*)X.data_ptr(), - (void*)quant_X.data_ptr(), col, vecs, - stream); - switch (type) { - case 2: - mul_mat_vec_q4_0_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 3: - mul_mat_vec_q4_1_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 6: - mul_mat_vec_q5_0_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 7: - mul_mat_vec_q5_1_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 8: - mul_mat_vec_q8_0_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 10: - mul_mat_vec_q2_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 11: - mul_mat_vec_q3_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 12: - mul_mat_vec_q4_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 13: - mul_mat_vec_q5_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 14: - mul_mat_vec_q6_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 16: - mul_mat_vec_iq2_xxs_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 17: 
- mul_mat_vec_iq2_xs_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 18: - mul_mat_vec_iq3_xxs_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 19: - mul_mat_vec_iq1_s_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 20: - mul_mat_vec_iq4_nl_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 21: - mul_mat_vec_iq3_s_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 22: - mul_mat_vec_iq2_s_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 23: - mul_mat_vec_iq4_xs_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - case 29: - mul_mat_vec_iq1_m_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, vecs, stream); - break; - } - }); - return Y; -} - -torch::stable::Tensor ggml_mul_mat_a8(torch::stable::Tensor W, // quant weight - torch::stable::Tensor X, // input - int64_t type, int64_t row) { - int col = X.sizes()[1]; - int padded = (col + 512 - 1) / 512 * 512; - int batch = X.sizes()[0]; - const torch::stable::accelerator::DeviceGuard device_guard( - X.get_device_index()); - auto Y = torch::stable::empty({batch, row}, X.scalar_type(), std::nullopt, - W.device()); - cudaStream_t stream = get_current_cuda_stream(); - auto quant_X = torch::stable::empty({batch, padded / 32 * 9}, - torch::headeronly::ScalarType::Int, - std::nullopt, W.device()); - VLLM_STABLE_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_mul_mat_a8", [&] { - quantize_row_q8_1_cuda((scalar_t*)X.data_ptr(), 
(void*)quant_X.data_ptr(), - col, batch, stream); - - switch (type) { - case 2: - ggml_mul_mat_q4_0_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream); - break; - case 3: - ggml_mul_mat_q4_1_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream); - break; - case 6: - ggml_mul_mat_q5_0_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream); - break; - case 7: - ggml_mul_mat_q5_1_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream); - break; - case 8: - ggml_mul_mat_q8_0_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream); - break; - case 10: - ggml_mul_mat_q2_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream); - break; - case 11: - ggml_mul_mat_q3_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream); - break; - case 12: - ggml_mul_mat_q4_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream); - break; - case 13: - ggml_mul_mat_q5_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream); - break; - case 14: - ggml_mul_mat_q6_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream); - break; - } - }); - return Y; -} - -torch::stable::Tensor ggml_moe_a8(torch::stable::Tensor X, // input - torch::stable::Tensor W, // expert weights - torch::stable::Tensor sorted_token_ids, - torch::stable::Tensor expert_ids, - torch::stable::Tensor 
num_tokens_post_padded, - int64_t type, int64_t row, int64_t top_k, - int64_t tokens) { - int col = X.sizes()[1]; - int padded = (col + 512 - 1) / 512 * 512; - const torch::stable::accelerator::DeviceGuard device_guard( - X.get_device_index()); - auto Y = torch::stable::empty({tokens * top_k, row}, X.scalar_type(), - std::nullopt, W.device()); - cudaStream_t stream = get_current_cuda_stream(); - auto quant_X = torch::stable::empty({tokens, padded / 32 * 9}, - torch::headeronly::ScalarType::Int, - std::nullopt, W.device()); - VLLM_STABLE_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_moe_a8", [&] { - quantize_row_q8_1_cuda((scalar_t*)X.data_ptr(), (void*)quant_X.data_ptr(), - col, tokens, stream); - switch (type) { - case 2: - ggml_moe_q4_0_q8_1_cuda( - (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream); - break; - case 3: - ggml_moe_q4_1_q8_1_cuda( - (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream); - break; - case 6: - ggml_moe_q5_0_q8_1_cuda( - (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream); - break; - case 7: - ggml_moe_q5_1_q8_1_cuda( - (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], 
stream); - break; - case 8: - ggml_moe_q8_0_q8_1_cuda( - (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream); - break; - case 10: - ggml_moe_q2_K_q8_1_cuda( - (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream); - break; - case 11: - ggml_moe_q3_K_q8_1_cuda( - (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream); - break; - case 12: - ggml_moe_q4_K_q8_1_cuda( - (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream); - break; - case 13: - ggml_moe_q5_K_q8_1_cuda( - (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream); - break; - case 14: - ggml_moe_q6_K_q8_1_cuda( - (void*)quant_X.data_ptr(), (void*)W.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(), - (int*)expert_ids.data_ptr(), - (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row, - tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream); - break; - } - }); - return Y; -} - 
-torch::stable::Tensor ggml_moe_a8_vec( - torch::stable::Tensor X, // input - torch::stable::Tensor W, // expert weights - torch::stable::Tensor topk_ids, int64_t top_k, int64_t type, int64_t row, - int64_t tokens) { - int col = X.sizes()[1]; - const int padded = (col + 512 - 1) / 512 * 512; - const torch::stable::accelerator::DeviceGuard device_guard( - X.get_device_index()); - auto Y = torch::stable::empty({tokens * top_k, row}, X.scalar_type(), - std::nullopt, W.device()); - torch::stable::fill_(Y, 0.0); - cudaStream_t stream = get_current_cuda_stream(); - auto quant_X = torch::stable::empty({tokens, padded / 32 * 9}, - torch::headeronly::ScalarType::Int, - std::nullopt, W.device()); - VLLM_STABLE_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_moe_vec_a8", [&] { - quantize_row_q8_1_cuda((scalar_t*)X.data_ptr(), - (void*)quant_X.data_ptr(), col, tokens, - stream); - switch (type) { - case 2: - moe_vec_q4_0_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 3: - moe_vec_q4_1_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 6: - moe_vec_q5_0_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 7: - moe_vec_q5_1_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 8: - moe_vec_q8_0_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 10: - moe_vec_q2_K_q8_1_cuda( - (void*)W.data_ptr(), 
(void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 11: - moe_vec_q3_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 12: - moe_vec_q4_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 13: - moe_vec_q5_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 14: - moe_vec_q6_K_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 16: - moe_vec_iq2_xxs_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 17: - moe_vec_iq2_xs_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 18: - moe_vec_iq3_xxs_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 19: - moe_vec_iq1_s_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 20: - moe_vec_iq4_nl_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - 
break; - case 21: - moe_vec_iq3_s_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 22: - moe_vec_iq2_s_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 23: - moe_vec_iq4_xs_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - case 29: - moe_vec_iq1_m_q8_1_cuda( - (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens, - col, row, quant_X.stride(0), stream); - break; - } - }); - return Y; -} - -int64_t ggml_moe_get_block_size(int64_t type) { - switch (type) { - case 2: - return MOE_X_Q4_0; - case 3: - return MOE_X_Q4_1; - case 6: - return MOE_X_Q5_0; - case 7: - return MOE_X_Q5_1; - case 8: - return MOE_X_Q8_0; - case 10: - return MOE_X_Q2_K; - case 11: - return MOE_X_Q3_K; - case 12: - return MOE_X_Q4_K; - case 13: - return MOE_X_Q5_K; - case 14: - return MOE_X_Q6_K; - } - return 0; -} diff --git a/csrc/libtorch_stable/quantization/gguf/moe.cuh b/csrc/libtorch_stable/quantization/gguf/moe.cuh deleted file mode 100644 index a2f9f46c8f89..000000000000 --- a/csrc/libtorch_stable/quantization/gguf/moe.cuh +++ /dev/null @@ -1,739 +0,0 @@ -#include - -/* Adapted from ./csrc/quantization/gguf/mmq.cuh - based on ./vllm/model_executor/layers/fused_moe/experts/triton_moe.py */ -template -static __device__ __forceinline__ void moe_q( - const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* __restrict__ sorted_token_ids, - const int* __restrict__ expert_ids, - const int* __restrict__ num_tokens_post_padded, const int exp_stride, - const int ncols_x, const int nrows_x, const int 
ncols_y, const int nrows_y, - const int nrows_dst, const int top_k) { - const int blocks_per_row_x = ncols_x / qk; - const int blocks_per_col_y = nrows_y / QK8_1; - const int blocks_per_warp = WARP_SIZE_GGUF / qi; - - const int ncols_dst = ncols_y * top_k; - - const auto row_dst_0 = blockIdx.x * mmq_y; - const int& row_x_0 = row_dst_0; - - const auto col_dst_0 = blockIdx.y * mmq_x; - - int token_offs[mmq_x / nwarps]; - for (int i = 0; i < mmq_x; i += nwarps) { - token_offs[i / nwarps] = sorted_token_ids[col_dst_0 + threadIdx.y + i]; - } - - const int exp_idx = expert_ids[blockIdx.y]; - if (exp_idx > 255 || exp_idx < 0) return; - if (blockIdx.y * mmq_x > num_tokens_post_padded[0]) return; - - const block_q_t* x = (const block_q_t*)((char*)vx + exp_idx * exp_stride); - const block_q8_1* y = (const block_q8_1*)(vy); - - int* tile_x_ql = nullptr; - half2* tile_x_dm = nullptr; - int* tile_x_qh = nullptr; - int* tile_x_sc = nullptr; - - allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - - __shared__ int tile_y_qs[mmq_x * WARP_SIZE_GGUF]; - __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE_GGUF / QI8_1]; - - float sum[mmq_y / WARP_SIZE_GGUF][mmq_x / nwarps] = {{0.0f}}; - - for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { - load_tiles(x + row_x_0 * blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, - tile_x_qh, tile_x_sc, threadIdx.y, nrows_x - row_x_0 - 1, - threadIdx.x, blocks_per_row_x); - - const int n_per_r = ((qk * blocks_per_warp) / qr); -#pragma unroll - for (int ir = 0; ir < qr && ib0 * qk + ir * n_per_r < ncols_x; ++ir) { - const auto kqs = ir * WARP_SIZE_GGUF + threadIdx.x; - const int kbxd = kqs / QI8_1; - -#pragma unroll - for (int i = 0; i < mmq_x; i += nwarps) { - const int col_y_eff = token_offs[i / nwarps] / top_k; - const int block_x = ib0 * (qk / QK8_1) + kbxd; - if (col_y_eff < ncols_y && block_x < blocks_per_col_y) { - const block_q8_1* by0 = &y[col_y_eff * blocks_per_col_y + block_x]; - const int index_y = - (threadIdx.y + i) * 
WARP_SIZE_GGUF + kqs % WARP_SIZE_GGUF; - tile_y_qs[index_y] = - get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1); - } - } - - if (threadIdx.x < n_per_r / QK8_1) { - const auto kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1); - const int col_y_eff = token_offs[threadIdx.y] / top_k; - const int block_x = - ib0 * (qk / QK8_1) + ir * (WARP_SIZE_GGUF / QI8_1) + kby; - - if (col_y_eff < ncols_y && block_x < blocks_per_col_y) { - const half2* dsi_src = &y[col_y_eff * blocks_per_col_y + block_x].ds; - half2* dsi_dst = - &tile_y_ds[threadIdx.y * (WARP_SIZE_GGUF / QI8_1) + kby]; - - if (need_sum) { - *dsi_dst = *dsi_src; - } else { - float* dfi_dst = (float*)dsi_dst; - *dfi_dst = __low2float(*dsi_src); - } - } - } - __syncthreads(); - - // #pragma unroll // unrolling this loop causes too much register pressure - for (int k = ir * WARP_SIZE_GGUF / qr; k < (ir + 1) * WARP_SIZE_GGUF / qr; - k += vdr) { -#pragma unroll - for (int j = 0; j < mmq_x; j += nwarps) { -#pragma unroll - for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) { - sum[i / WARP_SIZE_GGUF][j / nwarps] += - vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, - tile_y_ds, threadIdx.x + i, threadIdx.y + j, k); - } - } - } - __syncthreads(); - } - } - -#pragma unroll - for (int j = 0; j < mmq_x; j += nwarps) { - const int col_dst = token_offs[j / nwarps]; - if (col_dst >= ncols_dst) { - return; - } - -#pragma unroll - for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) { - const auto row_dst = row_dst_0 + threadIdx.x + i; - if (row_dst >= nrows_dst) { - continue; - } - dst[col_dst * nrows_dst + row_dst] = sum[i / WARP_SIZE_GGUF][j / nwarps]; - } - } -} - -#if defined(USE_ROCM) - #define MOE_X_Q4_0 8 - #define MOE_Y_Q4_0 128 - #define NWARPS_Q4_0 8 -#else - #define MOE_X_Q4_0 4 - #define MOE_Y_Q4_0 32 - #define NWARPS_Q4_0 4 -#endif - -template -static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_0, 2) -#endif - moe_q4_0(const void* __restrict__ vx, const void* 
__restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - const int top_k) { - const int mmq_x = MOE_X_Q4_0; - const int mmq_y = MOE_Y_Q4_0; - const int nwarps = NWARPS_Q4_0; - - moe_q, load_tiles_q4_0, - VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} - -template -static void ggml_moe_q4_0_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - int mmq_x = MOE_X_Q4_0; - int mmq_y = MOE_Y_Q4_0; - int nwarps = NWARPS_Q4_0; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - constexpr bool need_check = false; - moe_q4_0<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q4_0<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } -} - -#if defined(USE_ROCM) - #define MOE_X_Q4_1 8 - #define MOE_Y_Q4_1 128 - #define NWARPS_Q4_1 8 -#else - #define MOE_X_Q4_1 4 - #define MOE_Y_Q4_1 32 - #define NWARPS_Q4_1 4 -#endif - -template -static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF* 
NWARPS_Q4_1, 2) -#endif - moe_q4_1(const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - const int top_k) { - const int mmq_x = MOE_X_Q4_1; - const int mmq_y = MOE_Y_Q4_1; - const int nwarps = NWARPS_Q4_1; - - moe_q, load_tiles_q4_1, - VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} - -template -static void ggml_moe_q4_1_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - int mmq_x = MOE_X_Q4_1; - int mmq_y = MOE_Y_Q4_1; - int nwarps = NWARPS_Q4_1; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - constexpr bool need_check = false; - moe_q4_1<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q4_1<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } -} - -#if defined(USE_ROCM) - #define MOE_X_Q5_0 8 - #define MOE_Y_Q5_0 128 - #define NWARPS_Q5_0 8 -#else - #define MOE_X_Q5_0 4 - #define MOE_Y_Q5_0 32 - #define NWARPS_Q5_0 4 -#endif - -template -static 
__global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_0, 2) -#endif - moe_q5_0(const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - const int top_k) { - const int mmq_x = MOE_X_Q5_0; - const int mmq_y = MOE_Y_Q5_0; - const int nwarps = NWARPS_Q5_0; - - moe_q, load_tiles_q5_0, - VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} - -template -static void ggml_moe_q5_0_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - const int mmq_x = MOE_X_Q5_0; - const int mmq_y = MOE_Y_Q5_0; - const int nwarps = NWARPS_Q5_0; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - constexpr bool need_check = false; - moe_q5_0<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q5_0<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } -} - -#if defined(USE_ROCM) - #define MOE_X_Q5_1 8 - #define MOE_Y_Q5_1 128 - #define NWARPS_Q5_1 8 -#else - #define 
MOE_X_Q5_1 4 - #define MOE_Y_Q5_1 32 - #define NWARPS_Q5_1 4 -#endif - -template -static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_1, 2) -#endif - moe_q5_1(const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - const int top_k) { - const int mmq_x = MOE_X_Q5_1; - const int mmq_y = MOE_Y_Q5_1; - const int nwarps = NWARPS_Q5_1; - - moe_q, load_tiles_q5_1, - VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} - -template -static void ggml_moe_q5_1_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - const int mmq_x = MOE_X_Q5_1; - const int mmq_y = MOE_Y_Q5_1; - const int nwarps = NWARPS_Q5_1; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - constexpr bool need_check = false; - moe_q5_1<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q5_1<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } -} - -#if defined(USE_ROCM) - 
#define MOE_X_Q8_0 8 - #define MOE_Y_Q8_0 128 - #define NWARPS_Q8_0 8 -#else - #define MOE_X_Q8_0 4 - #define MOE_Y_Q8_0 32 - #define NWARPS_Q8_0 4 -#endif - -template -static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q8_0, 2) -#endif - moe_q8_0(const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - const int top_k) { - const int mmq_x = MOE_X_Q8_0; - const int mmq_y = MOE_Y_Q8_0; - const int nwarps = NWARPS_Q8_0; - - moe_q, load_tiles_q8_0, - VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} - -template -static void ggml_moe_q8_0_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - const int mmq_x = MOE_X_Q8_0; - const int mmq_y = MOE_Y_Q8_0; - const int nwarps = NWARPS_Q8_0; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - constexpr bool need_check = false; - moe_q8_0<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q8_0<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, 
ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } -} - -#if defined(USE_ROCM) - #define MOE_X_Q2_K 8 - #define MOE_Y_Q2_K 128 - #define NWARPS_Q2_K 8 -#else - #define MOE_X_Q2_K 4 - #define MOE_Y_Q2_K 32 - #define NWARPS_Q2_K 4 -#endif - -template -static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q2_K, 2) -#endif - moe_q2_K(const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - const int top_k) { - const int mmq_x = MOE_X_Q2_K; - const int mmq_y = MOE_Y_Q2_K; - const int nwarps = NWARPS_Q2_K; - - moe_q, load_tiles_q2_K, - VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} - -template -static void ggml_moe_q2_K_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - const int mmq_x = MOE_X_Q2_K; - const int mmq_y = MOE_Y_Q2_K; - const int nwarps = NWARPS_Q2_K; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - constexpr bool need_check = false; - moe_q2_K<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q2_K<<>>( 
- w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } -} - -#if defined(USE_ROCM) - #define MOE_X_Q3_K 8 - #define MOE_Y_Q3_K 128 - #define NWARPS_Q3_K 8 -#else - #define MOE_X_Q3_K 4 - #define MOE_Y_Q3_K 32 - #define NWARPS_Q3_K 4 -#endif - -template -static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q3_K, 2) -#endif - moe_q3_K(const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - const int top_k) { - - const int mmq_x = MOE_X_Q3_K; - const int mmq_y = MOE_Y_Q3_K; - const int nwarps = NWARPS_Q3_K; - - moe_q, load_tiles_q3_K, - VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} -template -static void ggml_moe_q3_K_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - const int mmq_x = MOE_X_Q3_K; - const int mmq_y = MOE_Y_Q3_K; - const int nwarps = NWARPS_Q3_K; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - constexpr bool need_check = false; - moe_q3_K<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, 
nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q3_K<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } -} - -#if defined(USE_ROCM) - #define MOE_X_Q4_K 8 - #define MOE_Y_Q4_K 128 - #define NWARPS_Q4_K 8 -#else - #define MOE_X_Q4_K 4 - #define MOE_Y_Q4_K 32 - #define NWARPS_Q4_K 4 -#endif - -template -static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_K, 2) -#endif - moe_q4_K(const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - const int top_k) { - const int mmq_x = MOE_X_Q4_K; - const int mmq_y = MOE_Y_Q4_K; - const int nwarps = NWARPS_Q4_K; - - moe_q, load_tiles_q4_K, - VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} - -template -static void ggml_moe_q4_K_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - const int mmq_x = MOE_X_Q4_K; - const int mmq_y = MOE_Y_Q4_K; - const int nwarps = NWARPS_Q4_K; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - constexpr bool need_check = false; - moe_q4_K<<>>( - w, inp, dst, sorted_token_ids, 
expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q4_K<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } -} - -#if defined(USE_ROCM) - #define MOE_X_Q5_K 8 - #define MOE_Y_Q5_K 128 - #define NWARPS_Q5_K 8 -#else - #define MOE_X_Q5_K 4 - #define MOE_Y_Q5_K 32 - #define NWARPS_Q5_K 4 -#endif - -template -static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_K, 2) -#endif - moe_q5_K(const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - const int top_k) { - const int mmq_x = MOE_X_Q5_K; - const int mmq_y = MOE_Y_Q5_K; - const int nwarps = NWARPS_Q5_K; - - moe_q, load_tiles_q5_K, - VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} - -template -static void ggml_moe_q5_K_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - const int mmq_x = MOE_X_Q5_K; - const int mmq_y = MOE_Y_Q5_K; - const int nwarps = NWARPS_Q5_K; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - 
constexpr bool need_check = false; - moe_q5_K<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q5_K<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } -} - -#if defined(USE_ROCM) - #define MOE_X_Q6_K 8 - #define MOE_Y_Q6_K 128 - #define NWARPS_Q6_K 8 -#else - #define MOE_X_Q6_K 4 - #define MOE_Y_Q6_K 32 - #define NWARPS_Q6_K 4 -#endif - -template -static __global__ void -#if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q6_K, 2) -#endif - moe_q6_K(const void* __restrict__ vx, const void* __restrict__ vy, - scalar_t* __restrict__ dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, - const int top_k) { - const int mmq_x = MOE_X_Q6_K; - const int mmq_y = MOE_Y_Q6_K; - const int nwarps = NWARPS_Q6_K; - - moe_q, load_tiles_q6_K, - VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>( - vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); -} - -template -static void ggml_moe_q6_K_q8_1_cuda( - const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids, - const int* expert_ids, const int* num_tokens_post_padded, - const int exp_stride, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k, - const int tokens_post_padded, cudaStream_t stream) { - const int mmq_x = MOE_X_Q6_K; - const int mmq_y = MOE_Y_Q6_K; - const int nwarps = NWARPS_Q6_K; - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (tokens_post_padded) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - 
const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - constexpr bool need_check = false; - moe_q6_K<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } else { - constexpr bool need_check = true; - moe_q6_K<<>>( - w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded, - exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k); - } -} diff --git a/csrc/libtorch_stable/quantization/gguf/moe_vec.cuh b/csrc/libtorch_stable/quantization/gguf/moe_vec.cuh deleted file mode 100644 index 60f65a1bfdcb..000000000000 --- a/csrc/libtorch_stable/quantization/gguf/moe_vec.cuh +++ /dev/null @@ -1,338 +0,0 @@ -// copied and adapted from -// https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmvq.cu -template -static __global__ void moe_vec_q(const void* __restrict__ vx, - const void* __restrict__ vy, - scalar_t* __restrict__ dst, - const int* topk_ids, const int topk, - const int ncols, const int nrows, - const int token_stride) { - const auto row = blockIdx.x * blockDim.y + threadIdx.y; - - const auto token = blockIdx.z / topk; - const auto expert = (topk_ids)[blockIdx.z]; - - if (row >= nrows) { - return; - } - - const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * WARP_SIZE / qi; - - // partial sum for each thread - float tmp = 0.0f; - - const block_q_t* x = ((const block_q_t*)vx) + expert * nrows * blocks_per_row; - const block_q8_1* y = - (const block_q8_1*)(((const int*)vy) + token * token_stride); - - for (auto i = threadIdx.x / (qi / vdr); i < blocks_per_row; - i += blocks_per_warp) { - const int ibx = row * blocks_per_row + i; // x block index - - const int iby = i * (qk / QK8_1); // y block index that aligns with ibx - - const int iqs = - vdr * - (threadIdx.x % - (qi / vdr)); // x block quant index when casting the quants to int - - tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs); - } - - // sum up 
partial sums and write back result -#pragma unroll - for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { - tmp += VLLM_SHFL_XOR_SYNC(tmp, mask); - } - - if (threadIdx.x == 0) { - dst[blockIdx.z * nrows + row] = tmp; - } -} - -template -static void moe_vec_q4_0_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_q4_1_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_q5_0_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_q5_1_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) 
{ - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_q8_0_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_q2_K_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_q3_K_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_q4_K_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - 
const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_q5_K_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_q6_K_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_iq2_xxs_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q - <<>>(vx, vy, dst, topk_ids, top_k, - ncols, nrows, token_stride); -} - -template -static void moe_vec_iq2_xs_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, 
const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q - <<>>(vx, vy, dst, topk_ids, top_k, - ncols, nrows, token_stride); -} - -template -static void moe_vec_iq2_s_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q - <<>>(vx, vy, dst, topk_ids, top_k, - ncols, nrows, token_stride); -} - -template -static void moe_vec_iq3_xxs_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q - <<>>(vx, vy, dst, topk_ids, top_k, - ncols, nrows, token_stride); -} - -template -static void moe_vec_iq1_s_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q - <<>>(vx, vy, dst, topk_ids, top_k, - ncols, nrows, token_stride); -} - -template -static void moe_vec_iq1_m_q8_1_cuda(const void* vx, const void* 
vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q - <<>>(vx, vy, dst, topk_ids, top_k, - ncols, nrows, token_stride); -} - -template -static void moe_vec_iq4_nl_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q<<>>( - vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride); -} - -template -static void moe_vec_iq4_xs_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q - <<>>(vx, vy, dst, topk_ids, top_k, - ncols, nrows, token_stride); -} - -template -static void moe_vec_iq3_s_q8_1_cuda(const void* vx, const void* vy, - scalar_t* dst, const int* topk_ids, - const int top_k, const int tokens, - const int ncols, const int nrows, - const int token_stride, - cudaStream_t stream) { - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, tokens * top_k); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - moe_vec_q - <<>>(vx, vy, dst, topk_ids, top_k, - ncols, nrows, token_stride); -} From 
ae0324c64b94bb723a628c2af242ad04ccc6adee Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 20 May 2026 22:25:57 +0800 Subject: [PATCH 20/21] remove ggml bindings Signed-off-by: Isotr0py --- csrc/libtorch_stable/ops.h | 29 ------------------- csrc/libtorch_stable/torch_bindings.cpp | 38 ------------------------- 2 files changed, 67 deletions(-) diff --git a/csrc/libtorch_stable/ops.h b/csrc/libtorch_stable/ops.h index 5ebcb2034f53..006a5af3dd81 100644 --- a/csrc/libtorch_stable/ops.h +++ b/csrc/libtorch_stable/ops.h @@ -220,32 +220,3 @@ torch::stable::Tensor gptq_gemm(torch::stable::Tensor a, void gptq_shuffle(torch::stable::Tensor q_weight, torch::stable::Tensor q_perm, int64_t bit); - -// GGML kernels (shared CUDA/ROCm) -torch::stable::Tensor ggml_dequantize( - torch::stable::Tensor W, int64_t type, int64_t m, int64_t n, - std::optional const& dtype); - -torch::stable::Tensor ggml_mul_mat_vec_a8(torch::stable::Tensor W, - torch::stable::Tensor X, int64_t type, - int64_t row); - -torch::stable::Tensor ggml_mul_mat_a8(torch::stable::Tensor W, - torch::stable::Tensor X, int64_t type, - int64_t row); - -torch::stable::Tensor ggml_moe_a8(torch::stable::Tensor X, - torch::stable::Tensor W, - torch::stable::Tensor sorted_token_ids, - torch::stable::Tensor expert_ids, - torch::stable::Tensor num_tokens_post_padded, - int64_t type, int64_t row, int64_t top_k, - int64_t tokens); - -torch::stable::Tensor ggml_moe_a8_vec(torch::stable::Tensor X, - torch::stable::Tensor W, - torch::stable::Tensor topk_ids, - int64_t top_k, int64_t type, int64_t row, - int64_t tokens); - -int64_t ggml_moe_get_block_size(int64_t type); diff --git a/csrc/libtorch_stable/torch_bindings.cpp b/csrc/libtorch_stable/torch_bindings.cpp index ee0af3da560c..2fd2060dcebc 100644 --- a/csrc/libtorch_stable/torch_bindings.cpp +++ b/csrc/libtorch_stable/torch_bindings.cpp @@ -341,34 +341,6 @@ STABLE_TORCH_LIBRARY_FRAGMENT(_C, ops) { // Post processing for GPTQ. ops.def("gptq_shuffle(Tensor! 
q_weight, Tensor q_perm, int bit) -> ()"); - - // Dequantization for GGML. - ops.def( - "ggml_dequantize(Tensor W, int type, SymInt m, SymInt n, ScalarType? " - "dtype) -> Tensor"); - - // mmvq kernel for GGML. - ops.def( - "ggml_mul_mat_vec_a8(Tensor W, Tensor X, int type, SymInt row) " - "-> Tensor"); - - // mmq kernel for GGML. - ops.def( - "ggml_mul_mat_a8(Tensor W, Tensor X, int type, SymInt row) -> Tensor"); - - // moe kernel for GGML. - ops.def( - "ggml_moe_a8(Tensor X, Tensor W, " - "Tensor sorted_token_ids, Tensor expert_ids, Tensor " - "num_tokens_post_padded, " - "int type, SymInt row, SymInt top_k, SymInt tokens) -> Tensor"); - - ops.def( - "ggml_moe_a8_vec(Tensor X, Tensor W, " - "Tensor topk_ids, int top_k, " - "int type, SymInt row, SymInt tokens) -> Tensor"); - - ops.def("ggml_moe_get_block_size(int type) -> int"); } STABLE_TORCH_LIBRARY_IMPL(_C, CUDA, ops) { @@ -441,13 +413,6 @@ STABLE_TORCH_LIBRARY_IMPL(_C, CUDA, ops) { // GPTQ kernels ops.impl("gptq_gemm", TORCH_BOX(&gptq_gemm)); ops.impl("gptq_shuffle", TORCH_BOX(&gptq_shuffle)); - - // GGML kernels - ops.impl("ggml_dequantize", TORCH_BOX(&ggml_dequantize)); - ops.impl("ggml_mul_mat_vec_a8", TORCH_BOX(&ggml_mul_mat_vec_a8)); - ops.impl("ggml_mul_mat_a8", TORCH_BOX(&ggml_mul_mat_a8)); - ops.impl("ggml_moe_a8", TORCH_BOX(&ggml_moe_a8)); - ops.impl("ggml_moe_a8_vec", TORCH_BOX(&ggml_moe_a8_vec)); } // These capability-check functions take only primitive args (no tensors), so @@ -465,9 +430,6 @@ STABLE_TORCH_LIBRARY_IMPL(_C, CompositeExplicitAutograd, ops) { ops.impl("cutlass_scaled_mm_supports_fp4", TORCH_BOX(&cutlass_scaled_mm_supports_fp4)); #endif - - // GGML block size lookup (no tensor args) - ops.impl("ggml_moe_get_block_size", TORCH_BOX(&ggml_moe_get_block_size)); } REGISTER_EXTENSION(_C_stable_libtorch) From c33cf711a00049ff7cad29a8117234f2aec6cb47 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 20 May 2026 23:01:21 +0800 Subject: [PATCH 21/21] fix build Signed-off-by: Isotr0py --- 
CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d5039470d41b..a4c9a7fd3e36 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -632,8 +632,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP") "csrc/libtorch_stable/activation_kernels.cu" "csrc/libtorch_stable/quantization/w8a8/int8/scaled_quant.cu" "csrc/libtorch_stable/quantization/w8a8/fp8/common.cu" - "csrc/libtorch_stable/quantization/gptq/q_gemm.cu" - "csrc/libtorch_stable/quantization/gguf/gguf_kernel.cu") + "csrc/libtorch_stable/quantization/gptq/q_gemm.cu") if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_STABLE_EXT_SRC