From 937eae3a34c94998eae4bcaf0525aaab609726df Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 12 Apr 2026 13:44:05 +0800
Subject: [PATCH 01/21] draft gguf remove

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 .github/dependabot.yml                        |    1 -
 .pre-commit-config.yaml                       |    2 +-
 CMakeLists.txt                                |    1 -
 csrc/ops.h                                    |   24 +-
 csrc/quantization/gguf/dequantize.cuh         |  571 ------
 csrc/quantization/gguf/ggml-common.h          | 1150 -----------
 csrc/quantization/gguf/gguf_kernel.cu         |  542 -----
 csrc/quantization/gguf/mmq.cuh                |  610 ------
 csrc/quantization/gguf/mmvq.cuh               |  212 --
 csrc/quantization/gguf/moe.cuh                |  739 -------
 csrc/quantization/gguf/moe_vec.cuh            |  338 ---
 csrc/quantization/gguf/vecdotq.cuh            | 1812 -----------------
 csrc/torch_bindings.cpp                       |   33 -
 docs/features/quantization/README.md          |    2 -
 docs/features/quantization/gguf.md            |   87 -
 docs/mkdocs/hooks/generate_examples.py        |    1 -
 requirements/common.txt                       |    1 -
 requirements/test/rocm.txt                    |    8 -
 tests/compile/fullgraph/test_full_graph.py    |    6 -
 tests/kernels/quantization/test_ggml.py       |   54 -
 tests/kernels/quantization/test_gguf.py       |  207 --
 .../generation/test_multimodal_gguf.py        |  180 --
 tests/models/quantization/test_gguf.py        |  204 --
 tests/models/test_gguf_download.py            |  221 --
 tests/transformers_utils/test_utils.py        |  210 --
 vllm/_custom_ops.py                           |  128 --
 vllm/config/load.py                           |    2 -
 vllm/config/model.py                          |   26 +-
 vllm/engine/arg_utils.py                      |   10 +-
 vllm/model_executor/layers/fused_moe/layer.py |   13 +-
 vllm/model_executor/layers/linear.py          |   72 +-
 .../layers/quantization/__init__.py           |    3 -
 .../layers/quantization/base_config.py        |   26 +
 .../layers/quantization/gguf.py               |  691 -------
 .../layers/vocab_parallel_embedding.py        |    6 +-
 vllm/model_executor/model_loader/__init__.py  |    4 -
 .../model_loader/gguf_loader.py               |  436 ----
 .../model_loader/weight_utils.py              |  168 +-
 vllm/model_executor/models/apertus.py         |    7 +-
 vllm/model_executor/models/exaone.py          |    6 +-
 vllm/model_executor/models/exaone4.py         |    6 +-
 vllm/model_executor/models/gemma3.py          |   12 +-
 vllm/model_executor/models/jais2.py           |    6 +-
 vllm/model_executor/models/llama.py           |    7 +-
 vllm/model_executor/models/llama4.py          |    7 +-
 vllm/model_executor/models/openpangu.py       |   21 +-
 vllm/model_executor/models/siglip.py          |   15 +-
 vllm/model_format.py                          |  162 ++
 vllm/platforms/rocm.py                        |    1 -
 vllm/tokenizers/registry.py                   |   31 +-
 vllm/transformers_utils/config.py             |  136 +-
 vllm/transformers_utils/gguf_utils.py         |  336 ---
 vllm/transformers_utils/processor.py          |   20 +-
 vllm/v1/metrics/perf.py                       |    1 -
 54 files changed, 359 insertions(+), 9216 deletions(-)
 delete mode 100644 csrc/quantization/gguf/dequantize.cuh
 delete mode 100644 csrc/quantization/gguf/ggml-common.h
 delete mode 100644 csrc/quantization/gguf/gguf_kernel.cu
 delete mode 100644 csrc/quantization/gguf/mmq.cuh
 delete mode 100644 csrc/quantization/gguf/mmvq.cuh
 delete mode 100644 csrc/quantization/gguf/moe.cuh
 delete mode 100644 csrc/quantization/gguf/moe_vec.cuh
 delete mode 100644 csrc/quantization/gguf/vecdotq.cuh
 delete mode 100644 docs/features/quantization/gguf.md
 delete mode 100644 tests/kernels/quantization/test_ggml.py
 delete mode 100644 tests/kernels/quantization/test_gguf.py
 delete mode 100644 tests/models/multimodal/generation/test_multimodal_gguf.py
 delete mode 100644 tests/models/quantization/test_gguf.py
 delete mode 100644 tests/models/test_gguf_download.py
 delete mode 100644 vllm/model_executor/layers/quantization/gguf.py
 delete mode 100644 vllm/model_executor/model_loader/gguf_loader.py
 create mode 100644 vllm/model_format.py
 delete mode 100644 vllm/transformers_utils/gguf_utils.py

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index a017d69be991..944929fc55e5 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -21,7 +21,6 @@ updates:
       - dependency-name: "torchvision"
       - dependency-name: "xformers"
       - dependency-name: "lm-format-enforcer"
-      - dependency-name: "gguf"
       - dependency-name: "compressed-tensors"
       - dependency-name: "ray[cgraph]" # Ray Compiled Graph
       - dependency-name: "lm-eval"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 33b1db69dec4..6896b4494f27 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,7 +21,7 @@ repos:
   rev: v21.1.2
   hooks:
   - id: clang-format
-    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
+    exclude: 'csrc/moe/topk_softmax_kernels.cu|vllm/third_party/.*'
     types_or: [c++, cuda]
     args: [--style=file, --verbose]
 - repo: https://github.com/DavidAnson/markdownlint-cli2
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f24c12eff83c..d22252f69280 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -300,7 +300,6 @@ set(VLLM_EXT_SRC
   "csrc/quantization/w8a8/fp8/common.cu"
   "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
   "csrc/quantization/fused_kernels/fused_silu_mul_block_quant.cu"
-  "csrc/quantization/gguf/gguf_kernel.cu"
   "csrc/quantization/activation_kernels.cu"
   "csrc/cuda_utils_kernels.cu"
   "csrc/custom_all_reduce.cu"
diff --git a/csrc/ops.h b/csrc/ops.h
index da066512b7c1..2ef62b801334 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -203,28 +203,6 @@ torch::Tensor awq_dequantize(torch::Tensor _kernel,
 
 #endif
 
-torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
-                              int64_t n,
-                              std::optional<at::ScalarType> const& dtype);
-
-torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X,
-                                  int64_t type, int64_t row);
-
-torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type,
-                              int64_t row);
-
-torch::Tensor ggml_moe_a8(torch::Tensor X, torch::Tensor W,
-                          torch::Tensor sorted_token_ids,
-                          torch::Tensor expert_ids,
-                          torch::Tensor num_tokens_post_padded, int64_t type,
-                          int64_t row, int64_t top_k, int64_t tokens);
-
-torch::Tensor ggml_moe_a8_vec(torch::Tensor X, torch::Tensor W,
-                              torch::Tensor topk_ids, int64_t top_k,
-                              int64_t type, int64_t row, int64_t tokens);
-
-int64_t ggml_moe_get_block_size(int64_t type);
-
 void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
                               torch::Tensor const& scale,
                               std::optional<torch::Tensor> const& azp);
@@ -320,4 +298,4 @@ std::tuple<torch::Tensor, torch::Tensor> minimax_allreduce_rms_qk(
     torch::Tensor const& norm_weight_k, torch::Tensor workspace,
     int64_t const q_size, int64_t const kv_size, int64_t const rank,
     int64_t const nranks, double const eps);
-#endif
\ No newline at end of file
+#endif
diff --git a/csrc/quantization/gguf/dequantize.cuh b/csrc/quantization/gguf/dequantize.cuh
deleted file mode 100644
index 9d355003ef91..000000000000
--- a/csrc/quantization/gguf/dequantize.cuh
+++ /dev/null
@@ -1,571 +0,0 @@
-// copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/convert.cu
-// Dequant functions
-static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const block_q4_0 * x = (const block_q4_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    const int vui = x[ib].qs[iqs];
-
-    v.x = __int2half_rn(vui & 0xF);
-    v.y = __int2half_rn(vui >> 4);
-
-    v = __hsub2(v, __floats2half2_rn(8.0f, 8.0f));
-    v = __hmul2(v, {d, d});
-}
-
-static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const block_q4_1 * x = (const block_q4_1 *) vx;
-
-    const dfloat d = __low2half(x[ib].dm);
-    const dfloat m = __high2half(x[ib].dm);
-
-    const int vui = x[ib].qs[iqs];
-
-    v.x = __int2half_rn(vui & 0xF);
-    v.y = __int2half_rn(vui >> 4);
-
-    v = __hmul2(v, {d, d});
-    v = __hadd2(v, {m, m});
-}
-
-static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const block_q5_0 * x = (const block_q5_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    uint32_t qh;
-    memcpy(&qh, x[ib].qh, sizeof(qh));
-
-    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    v.x = __int2half_rn((x[ib].qs[iqs] & 0xf) | xh_0);
-    v.y = __int2half_rn((x[ib].qs[iqs] >>  4) | xh_1);
-
-    v = __hsub2(v, __floats2half2_rn(16.0f, 16.0f));
-    v = __hmul2(v, {d, d});
-}
-
-static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const block_q5_1 * x = (const block_q5_1 *) vx;
-
-    const dfloat d = __low2half(x[ib].dm);
-    const dfloat m = __high2half(x[ib].dm);
-
-    uint32_t qh;
-    memcpy(&qh, x[ib].qh, sizeof(qh));
-
-    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    v.x = __int2half_rn((x[ib].qs[iqs] & 0xf) | xh_0);
-    v.y = __int2half_rn((x[ib].qs[iqs] >>  4) | xh_1);
-
-    v = __hmul2(v, {d, d});
-    v = __hadd2(v, {m, m});
-}
-
-static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const block_q8_0 * x = (const block_q8_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    v.x = __int2half_rn(x[ib].qs[iqs + 0]);
-    v.y = __int2half_rn(x[ib].qs[iqs + 1]);
-
-    v = __hmul2(v, {d, d});
-}
-
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
-    const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
-
-    if (i >= k) {
-        return;
-    }
-
-    const int ib = i/qk; // block index
-    const int iqs = (i%qk)/qr; // quant index
-    const int iybs = i - i%qk; // y block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    // dequantize
-    dfloat2 v;
-    dequantize_kernel(vx, ib, iqs, v);
-
-    y[iybs + iqs + 0]        = convert_from_half<dst_t>(v.x);
-    y[iybs + iqs + y_offset] = convert_from_half<dst_t>(v.y);
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const auto i   = blockIdx.x;
-    const block_q2_K * x = (const block_q2_K *) vx;
-
-    const auto tid = threadIdx.x;
-    const int n   = tid/32;
-    const int l   = tid - 32*n;
-    const int is  = 8*n + l/16;
-
-    const uint8_t q = x[i].qs[32*n + l];
-    dst_t * y = yy + i*QK_K + 128*n;
-
-    half dall = __low2half(x[i].dm);
-    half dmin = __high2half(x[i].dm);
-    y[l+ 0] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+0] & 0xF) * ((q >> 0) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+0] >> 4))));
-    y[l+32] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+2] & 0xF) * ((q >> 2) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+2] >> 4))));
-    y[l+64] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+4] & 0xF) * ((q >> 4) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+4] >> 4))));
-    y[l+96] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+6] & 0xF) * ((q >> 6) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+6] >> 4))));
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const auto i = blockIdx.x;
-    const block_q3_K * x = (const block_q3_K *) vx;
-
-    const auto r = threadIdx.x/4;
-    const int tid = r/2;
-    const int is0 = r%2;
-    const int l0 = 16*is0 + 4*(threadIdx.x%4);
-    const int n = tid / 4;
-    const int j = tid - 4*n;
-
-    uint8_t m = 1 << (4*n + j);
-    int is = 8*n + 2*j + is0;
-    int shift = 2*j;
-
-    int8_t us = is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
-                is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
-                is < 12 ? (x[i].scales[is-8] >>  4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
-                          (x[i].scales[is-8] >>  4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
-    half d_all = x[i].d;
-    half dl = __hmul(d_all,  __int2half_rn(us - 32));
-
-    dst_t * y = yy + i*QK_K + 128*n + 32*j;
-    const uint8_t * q = x[i].qs + 32*n;
-    const uint8_t * hm = x[i].hmask;
-
-    for (int l = l0; l < l0+4; ++l) {
-        y[l] = convert_from_half<dst_t>(__hmul(dl,  __int2half_rn((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4))));
-    }
-}
-
-static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
-    if (j < 4) {
-        d = q[j] & 63; m = q[j + 4] & 63;
-    } else {
-        d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
-        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
-    }
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-    const block_q4_K * x = (const block_q4_K *) vx;
-
-    const auto i = blockIdx.x;
-
-    // assume 32 threads
-    const auto tid = threadIdx.x;
-    const int il  = tid/8;
-    const int ir  = tid%8;
-    const int is  = 2*il;
-    const int n   = 4;
-
-    dst_t * y = yy + i*QK_K + 64*il + n*ir;
-
-    const half dall = __low2half(x[i].dm);
-    const half dmin = __high2half(x[i].dm);
-
-    const uint8_t * q = x[i].qs + 32*il + n*ir;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[i].scales, sc, m);
-    const half d1 = __hmul(dall, __int2half_rn(sc));
-    const half m1 = __hmul(dmin,  __int2half_rn(m));
-    get_scale_min_k4(is + 1, x[i].scales, sc, m);
-    const half d2 = __hmul(dall, __int2half_rn(sc));
-    const half m2 = __hmul(dmin, __int2half_rn(m));
-    for (int l = 0; l < n; ++l) {
-        y[l + 0] = convert_from_half<dst_t>(__hsub(__hmul(d1, __int2half_rn(q[l] & 0xF)), m1));
-        y[l +32] = convert_from_half<dst_t>(__hsub(__hmul(d2,  __int2half_rn(q[l] >> 4)), m2));
-    }
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-    const block_q5_K * x = (const block_q5_K *) vx;
-
-    const auto i = blockIdx.x;
-
-    // assume 64 threads - this is very slightly better than the one below
-    const auto tid = threadIdx.x;
-    const int il  = tid/16;   // il is in 0...3
-    const int ir  = tid%16;   // ir is in 0...15
-    const int is  = 2*il;     // is is in 0...6
-
-    dst_t * y = yy + i*QK_K + 64*il + 2*ir;
-
-    const half dall = __low2half(x[i].dm);
-    const half dmin = __high2half(x[i].dm);
-
-    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
-    const uint8_t * qh = x[i].qh + 2*ir;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[i].scales, sc, m);
-    const half d1 = __hmul(dall, __int2half_rn(sc)); const half m1 = __hmul(dmin, __int2half_rn(m));
-    get_scale_min_k4(is + 1, x[i].scales, sc, m);
-    const half d2 = __hmul(dall, __int2half_rn(sc)); const half m2 = __hmul(dmin, __int2half_rn(m));
-
-    uint8_t   hm  = 1 << (2*il);
-    y[ 0] = convert_from_half<dst_t>(__hsub(__hmul(d1, __int2half_rn((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0))), m1));
-    y[ 1] = convert_from_half<dst_t>(__hsub(__hmul(d1, __int2half_rn((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0))), m1));
-    hm <<= 1;
-    y[32] = convert_from_half<dst_t>(__hsub(__hmul(d2, __int2half_rn((ql[0] >>  4) + (qh[0] & hm ? 16 : 0))), m2));
-    y[33] = convert_from_half<dst_t>(__hsub(__hmul(d2, __int2half_rn((ql[1] >>  4) + (qh[1] & hm ? 16 : 0))), m2));
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-    const block_q6_K * x = (const block_q6_K *) vx;
-
-    const auto i = blockIdx.x;
-
-    // assume 64 threads - this is very slightly better than the one below
-    const auto tid = threadIdx.x;
-    const int ip  = tid/32;   // ip is 0 or 1
-    const int il  = tid - 32*ip; // 0...32
-    const int is  = 8*ip + il/16;
-
-    dst_t * y = yy + i*QK_K + 128*ip + il;
-
-    const half d = x[i].d;
-
-    const uint8_t * ql = x[i].ql + 64*ip + il;
-    const uint8_t   qh = x[i].qh[32*ip + il];
-    const int8_t  * sc = x[i].scales + is;
-
-    y[ 0] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32))));
-    y[32] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32))));
-    y[64] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32))));
-    y[96] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32))));
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const auto i   = blockIdx.x;
-    const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;
-
-    const auto tid = threadIdx.x;
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-    const uint16_t * q2 = x[i].qs + 4*ib;
-    const uint8_t  * aux8 = (const uint8_t *)q2;
-    const uint8_t  * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]);
-    const uint32_t aux32 = q2[2] | (q2[3] << 16);
-    const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.25f;
-    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
-    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const auto i   = blockIdx.x;
-    const block_iq2_xs * x = (const block_iq2_xs *) vx;
-
-    const auto tid = threadIdx.x;
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-    const uint16_t * q2 = x[i].qs + 4*ib;
-    const uint8_t  * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
-    const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
-    const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
-    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const auto i   = blockIdx.x;
-    const block_iq2_s * x = (const block_iq2_s *) vx;
-
-    const auto tid = threadIdx.x;
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-    const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
-    const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
-    const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
-    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const auto i   = blockIdx.x;
-    const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;
-
-    const auto tid = threadIdx.x;
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-    const uint8_t  * q3 = x[i].qs + 8*ib;
-    const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
-    const uint8_t  * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
-    const uint8_t  * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
-    const uint32_t aux32 = gas[0] | (gas[1] << 16);
-    const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.5f;
-    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
-    for (int j = 0; j < 4; ++j) {
-        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
-        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
-    }
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const auto i   = blockIdx.x;
-    const block_iq3_s * x = (const block_iq3_s *) vx;
-
-    const auto tid = threadIdx.x;
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-    const uint8_t * qs = x[i].qs + 8*ib;
-    const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
-    const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
-    const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f;
-    const uint8_t signs = x[i].signs[4*ib + il];
-    for (int j = 0; j < 4; ++j) {
-        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
-        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
-    }
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const int64_t i   = blockIdx.x;
-    const block_iq1_s * x = (const block_iq1_s  *) vx;
-
-    const int64_t tid = threadIdx.x;
-    const int64_t il = tid/8; // 0...3
-    const int64_t ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-    const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
-    const float d = __half2float(x[i].d) * (2*((x[i].qh[ib] >> 12) & 7) + 1);
-    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
-    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
-    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
-    grid32[0] &= 0x0f0f0f0f;
-    for (int j = 0; j < 8; ++j) {
-        y[j] = d * (q[j] + delta);
-    }
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const int64_t i   = blockIdx.x;
-    const block_iq1_m * x = (const block_iq1_m  *) vx;
-
-    const int64_t tid = threadIdx.x;
-    const int64_t il = tid/8; // 0...3
-    const int64_t ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-    const uint16_t * sc = (const uint16_t *)x[i].scales;
-    iq1m_scale_t scale;
-    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
-    const int64_t ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
-    const float d = __half2float(scale.f16) * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
-    const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
-    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
-    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
-    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
-    grid32[0] &= 0x0f0f0f0f;
-    for (int j = 0; j < 8; ++j) {
-        y[j] = d * (q[j] + delta);
-    }
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const auto i   = blockIdx.x;
-    const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
-
-    const auto tid = threadIdx.x;
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
-    const uint8_t  * q4 = x[ib].qs + 4*il;
-    const float d = __half2float(x[ib].d);
-    for (int j = 0; j < 4; ++j) {
-        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
-        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
-    }
-
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-    const auto i   = blockIdx.x;
-    const block_iq4_xs * x = (const block_iq4_xs *)vx;
-
-    const auto tid = threadIdx.x;
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
-    const uint8_t  * q4 = x[i].qs + 16*ib + 4*il;
-    const float d = __half2float(x[i].d) * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
-    for (int j = 0; j < 4; ++j) {
-        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
-        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
-    }
-}
-
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
-    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-template<typename dst_t>
-static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = (k + QK_K - 1) / QK_K;
-    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = (k + QK_K - 1) / QK_K;
-    dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static to_cuda_ggml_t<dst_t> ggml_get_to_cuda(int64_t type) {
-    switch (type) {
-        case 2:
-            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
-        case 3:
-            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
-        case 6:
-            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
-        case 7:
-            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
-        case 8:
-            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
-        case 10:
-            return dequantize_row_q2_K_cuda;
-        case 11:
-            return dequantize_row_q3_K_cuda;
-        case 12:
-            return dequantize_row_q4_K_cuda;
-        case 13:
-            return dequantize_row_q5_K_cuda;
-        case 14:
-            return dequantize_row_q6_K_cuda;
-        case 16:
-            return dequantize_row_iq2_xxs_cuda;
-        case 17:
-            return dequantize_row_iq2_xs_cuda;
-        case 18:
-            return dequantize_row_iq3_xxs_cuda;
-        case 19:
-            return dequantize_row_iq1_s_cuda;
-        case 20:
-            return dequantize_row_iq4_nl_cuda;
-        case 21:
-            return dequantize_row_iq3_s_cuda;
-        case 22:
-            return dequantize_row_iq2_s_cuda;
-        case 23:
-            return dequantize_row_iq4_xs_cuda;
-        case 29:
-            return dequantize_row_iq1_m_cuda;
-        default:
-            return nullptr;
-    }
-}
diff --git a/csrc/quantization/gguf/ggml-common.h b/csrc/quantization/gguf/ggml-common.h
deleted file mode 100644
index 6bef5db3ccf1..000000000000
--- a/csrc/quantization/gguf/ggml-common.h
+++ /dev/null
@@ -1,1150 +0,0 @@
-// copied from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-common.h
-#define QK_K 256
-#define K_QUANTS_PER_ITERATION 2
-#define WARP_SIZE_GGUF 32
-#define K_SCALE_SIZE 12
-#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
-#define CUDA_QUANTIZE_BLOCK_SIZE 256
-#define GGML_CUDA_DMMV_X 32
-#define GGML_CUDA_MMV_Y 1
-
-
-// Data Structures
-// QK = number of values after dequantization
-// QR = QK / number of values before dequantization
-// QI = number of 32 bit integers before dequantization
-
-#define QK4_0 32
-#define QR4_0 2
-#define QI4_0 (QK4_0 / (4 * QR4_0))
-typedef struct {
-    half    d;              // delta
-    uint8_t qs[QK4_0 / 2];  // nibbles / quants
-} block_q4_0;
-
-#define QK4_1 32
-#define QR4_1 2
-#define QI4_1 (QK4_1 / (4 * QR4_1))
-typedef struct {
-    half2   dm;             // dm.x = delta, dm.y = min
-    uint8_t qs[QK4_1 / 2];  // nibbles / quants
-} block_q4_1;
-
-#define QK5_0 32
-#define QR5_0 2
-#define QI5_0 (QK5_0 / (4 * QR5_0))
-typedef struct {
-    half d;                 // delta
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2];  // nibbles / quants
-} block_q5_0;
-
-#define QK5_1 32
-#define QR5_1 2
-#define QI5_1 (QK5_1 / (4 * QR5_1))
-typedef struct {
-    half2 dm;               // dm.x = delta, dm.y = min
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_1 / 2];  // nibbles / quants
-} block_q5_1;
-
-#define QK8_0 32
-#define QR8_0 1
-#define QI8_0 (QK8_0 / (4 * QR8_0))
-typedef struct {
-    half    d;              // delta
-    int8_t  qs[QK8_0];      // quants
-} block_q8_0;
-
-#define QK8_1 32
-#define QR8_1 1
-#define QI8_1 (QK8_1 / (4 * QR8_1))
-typedef struct {
-    half2   ds;             // ds.x = delta, ds.y = sum
-    int8_t  qs[QK8_0];      // quants
-} block_q8_1;
-
-#define QR2_K 4
-#define QI2_K (QK_K / (4*QR2_K))
-typedef struct {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    half2 dm;                // super-block scale for quantized scales/mins
-} block_q2_K;
-
-#define QR3_K 4
-#define QI3_K (QK_K / (4*QR3_K))
-typedef struct {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-    half d;             // super-block scale
-} block_q3_K;
-
-#define QR4_K 2
-#define QI4_K (QK_K / (4*QR4_K))
-typedef struct {
-    half2 dm;                  // super-block scale for quantized scales/mins
-    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-
-#define QR5_K 2
-#define QI5_K (QK_K / (4*QR5_K))
-typedef struct {
-    half2 dm;                     // super-block scale for quantized scales/mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];           // quants, high bit
-    uint8_t qs[QK_K/2];           // quants, low 4 bits
-} block_q5_K;
-
-#define QR6_K 2
-#define QI6_K (QK_K / (4*QR6_K))
-typedef struct {
-    uint8_t ql[QK_K/2];   // quants, lower 4 bits
-    uint8_t qh[QK_K/4];   // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales
-    half    d;         // delta
-} block_q6_K;
-
-#define QR2_XXS 8
-#define QI2_XXS (QK_K / (4*QR2_XXS))
-typedef struct {
-    half d;
-    uint16_t qs[QK_K/8];
-} block_iq2_xxs;
-
-#define QR2_XS 8
-#define QI2_XS (QK_K / (4*QR2_XS))
-typedef struct {
-    half d;
-    uint16_t qs[QK_K/8];
-    uint8_t  scales[QK_K/32];
-} block_iq2_xs;
-
-#define QR2_S 8
-#define QI2_S (QK_K / (4*QR2_S))
-typedef struct {
-    half d;
-    uint8_t qs[QK_K/4];
-    uint8_t qh[QK_K/32];
-    uint8_t scales[QK_K/32];
-} block_iq2_s;
-
-#define QR3_XXS 8
-#define QI3_XXS (QK_K / (4*QR3_XXS))
-typedef struct {
-    half d;
-    uint8_t qs[3*(QK_K/8)];
-} block_iq3_xxs;
-
-#define QR3_XS 8
-#define QI3_XS (QK_K / (4*QR3_XS))
-#define IQ3S_N_SCALE QK_K/64
-typedef struct {
-    half d;
-    uint8_t qs[QK_K/4];
-    uint8_t qh[QK_K/32];
-    uint8_t signs[QK_K/8];
-    uint8_t scales[IQ3S_N_SCALE];
-} block_iq3_s;
-
-// 1.5625 bpw
-#define QR1_S 8
-#define QI1_S (QK_K / (4*QR1_S))
-typedef struct {
-    half d;
-    uint8_t  qs[QK_K/8];
-    uint16_t qh[QK_K/32];
-} block_iq1_s;
-
-// 1.75 bpw
-#define QR1_M 8
-#define QI1_M (QK_K / (4*QR1_M))
-typedef struct {
-    uint8_t  qs[QK_K/8];      // grid index, low 8 bits
-    uint8_t  qh[QK_K/16];     // grid index, high 3 bits + grid shift bit (for two groups of 8)
-    uint8_t  scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
-} block_iq1_m;
-
-// Used by IQ1_M quants
-typedef union {
-    half f16;
-    uint16_t  u16;
-} iq1m_scale_t;
-
-#define QK4_NL 32
-#define QR4_NL 2
-#define QI4_NL (QK4_NL / (4*QR4_NL))
-typedef struct {
-    half d;
-    uint8_t qs[QK4_NL/2];
-} block_iq4_nl;
-
-#define QR4_XS 8
-#define QI4_XS (QK_K / (4*QR4_XS))
-typedef struct {
-    half d;
-    uint16_t scales_h;
-    uint8_t  scales_l[QK_K/64];
-    uint8_t  qs[QK_K/2];
-} block_iq4_xs;
-
-static const __device__ uint64_t iq2xxs_grid[256] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
-    0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
-    0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
-    0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
-    0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
-    0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
-    0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
-    0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
-    0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
-    0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
-    0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
-    0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
-    0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
-    0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
-    0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
-    0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
-    0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
-    0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
-    0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
-    0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
-    0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
-    0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
-    0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
-    0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
-    0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
-    0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
-    0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
-    0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
-    0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
-    0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
-    0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
-    0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
-    0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
-    0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
-    0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
-    0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
-    0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
-    0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
-    0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
-    0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
-    0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
-    0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
-    0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
-    0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
-    0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
-    0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
-    0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
-    0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
-    0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
-    0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
-    0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
-    0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
-    0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
-    0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
-    0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
-    0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
-    0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
-    0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
-    0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
-    0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
-    0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
-};
-
-static const __device__ uint64_t iq2xs_grid[512] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
-    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
-    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
-    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
-    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
-    0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
-    0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
-    0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
-    0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
-    0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
-    0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
-    0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
-    0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
-    0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
-    0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
-    0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
-    0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
-    0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
-    0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
-    0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
-    0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
-    0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
-    0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
-    0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
-    0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
-    0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
-    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
-    0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
-    0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
-    0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
-    0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
-    0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
-    0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
-    0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
-    0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
-    0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
-    0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
-    0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
-    0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
-    0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
-    0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
-    0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
-    0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
-    0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
-    0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
-    0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
-    0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
-    0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
-    0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
-    0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
-    0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
-    0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
-    0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
-    0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
-    0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
-    0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
-    0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
-    0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
-    0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
-    0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
-    0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
-    0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
-    0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
-    0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
-    0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
-    0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
-    0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
-    0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
-    0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
-    0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
-    0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
-    0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
-    0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
-    0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
-    0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
-    0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
-    0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
-    0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
-    0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
-    0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
-    0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
-    0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
-    0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
-    0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
-    0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
-    0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
-    0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
-    0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
-    0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
-    0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
-    0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
-    0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
-    0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
-    0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
-    0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
-    0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
-    0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
-    0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
-    0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
-    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
-    0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
-    0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
-    0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
-    0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
-    0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
-    0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
-    0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
-    0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
-    0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
-    0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
-    0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
-    0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
-    0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
-    0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
-    0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
-    0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
-    0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
-    0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
-    0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
-    0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
-    0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
-    0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
-    0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
-    0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
-    0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
-};
-
-static const __device__ uint64_t iq2s_grid[1024] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
-    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
-    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
-    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
-    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
-    0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
-    0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
-    0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
-    0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
-    0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
-    0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
-    0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
-    0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
-    0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
-    0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
-    0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
-    0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
-    0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
-    0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
-    0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
-    0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
-    0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
-    0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
-    0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
-    0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
-    0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
-    0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
-    0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
-    0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
-    0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
-    0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
-    0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
-    0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
-    0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
-    0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
-    0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
-    0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
-    0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
-    0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
-    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
-    0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
-    0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
-    0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
-    0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
-    0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
-    0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
-    0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
-    0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
-    0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
-    0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
-    0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
-    0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
-    0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
-    0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
-    0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
-    0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
-    0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
-    0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
-    0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
-    0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
-    0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
-    0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
-    0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
-    0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
-    0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
-    0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
-    0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
-    0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
-    0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
-    0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
-    0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
-    0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
-    0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
-    0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
-    0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
-    0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
-    0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
-    0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
-    0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
-    0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
-    0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
-    0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
-    0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
-    0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
-    0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
-    0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
-    0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
-    0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
-    0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
-    0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
-    0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
-    0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
-    0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
-    0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
-    0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
-    0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
-    0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
-    0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
-    0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
-    0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
-    0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
-    0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
-    0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
-    0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
-    0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
-    0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
-    0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
-    0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
-    0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
-    0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
-    0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
-    0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
-    0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
-    0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
-    0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
-    0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
-    0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
-    0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
-    0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
-    0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
-    0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
-    0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
-    0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
-    0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
-    0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
-    0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
-    0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
-    0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
-    0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
-    0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
-    0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
-    0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
-    0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
-    0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
-    0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
-    0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
-    0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
-    0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
-    0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
-    0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
-    0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
-    0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
-    0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
-    0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
-    0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
-    0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
-    0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
-    0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
-    0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
-    0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
-    0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
-    0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
-    0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
-    0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
-    0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
-    0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
-    0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
-    0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
-    0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
-    0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
-    0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
-    0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
-    0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
-    0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
-    0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
-    0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
-    0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
-    0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
-    0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
-    0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
-    0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
-    0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
-    0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
-    0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
-    0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
-    0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
-    0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
-    0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
-    0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
-    0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
-    0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
-    0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
-    0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
-    0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
-    0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
-    0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
-    0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
-    0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
-    0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
-    0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
-    0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
-    0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
-    0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
-    0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
-    0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
-    0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
-    0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
-    0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
-    0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
-    0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
-    0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
-    0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
-    0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
-    0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
-    0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
-    0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
-    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
-    0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
-    0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
-    0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
-    0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
-    0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
-    0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
-    0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
-    0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
-    0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
-    0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
-    0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
-    0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
-    0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
-    0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
-    0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
-    0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
-    0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
-    0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
-    0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
-    0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
-    0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
-    0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
-    0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
-    0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
-    0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
-    0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
-    0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
-    0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
-    0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
-    0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
-    0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
-    0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
-    0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
-    0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
-    0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
-    0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
-    0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
-    0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
-    0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
-    0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
-    0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
-    0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
-    0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
-    0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
-    0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
-    0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
-    0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
-    0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
-};
-
-static const __device__ uint32_t iq3xxs_grid[256] = {
-    0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
-    0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
-    0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
-    0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
-    0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
-    0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
-    0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
-    0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
-    0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
-    0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
-    0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
-    0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
-    0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
-    0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
-    0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
-    0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
-    0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
-    0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
-    0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
-    0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
-    0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
-    0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
-    0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
-    0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
-    0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
-    0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
-    0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
-    0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
-    0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
-    0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
-    0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
-    0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
-};
-
-static const __device__ uint32_t iq3xs_grid[512] = {
-    0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
-    0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
-    0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
-    0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
-    0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
-    0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
-    0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
-    0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
-    0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
-    0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
-    0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
-    0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
-    0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
-    0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
-    0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
-    0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
-    0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
-    0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
-    0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
-    0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
-    0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
-    0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
-    0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
-    0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
-    0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
-    0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
-    0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
-    0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
-    0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
-    0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
-    0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
-    0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
-    0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
-    0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
-    0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
-    0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
-    0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
-    0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
-    0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
-    0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
-    0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
-    0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
-    0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
-    0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
-    0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
-    0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
-    0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
-    0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
-    0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
-    0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
-    0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
-    0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
-    0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
-    0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
-    0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
-    0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
-    0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
-    0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
-    0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
-    0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
-    0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
-    0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
-    0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
-    0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
-};
-
-#define IQ1S_DELTA 0.125f
-#define IQ1M_DELTA 0.125f
-static const __device__ uint64_t iq1s_grid_gpu[2048] = {
-    0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000,
-    0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101,
-    0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200,
-    0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212,
-    0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011,
-    0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111,
-    0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220,
-    0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022,
-    0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220,
-    0x02020222, 0x00011001, 0x00011100, 0x00011102, 0x00021101, 0x01001001, 0x01001201, 0x01011101,
-    0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110,
-    0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111,
-    0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010,
-    0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210,
-    0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221,
-    0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021,
-    0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002,
-    0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101,
-    0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101,
-    0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211,
-    0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110,
-    0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022,
-    0x00002220, 0x00002222, 0x00012121, 0x00022020, 0x00022022, 0x00022220, 0x00022222, 0x01002121,
-    0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220,
-    0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001,
-    0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101,
-    0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102,
-    0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012,
-    0x00110111, 0x00110210, 0x00120011, 0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010,
-    0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 0x01120111,
-    0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122,
-    0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222,
-    0x01120120, 0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001,
-    0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102,
-    0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101,
-    0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000,
-    0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101,
-    0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112,
-    0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110,
-    0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211,
-    0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012,
-    0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111,
-    0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120,
-    0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122,
-    0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121,
-    0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221,
-    0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001,
-    0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101,
-    0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101,
-    0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011,
-    0x00122111, 0x01102012, 0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111,
-    0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011,
-    0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122,
-    0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121,
-    0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222,
-    0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101,
-    0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000,
-    0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200,
-    0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 0x01200012, 0x01200110,
-    0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112,
-    0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222,
-    0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021,
-    0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121,
-    0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201,
-    0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 0x01211002, 0x01211101, 0x01211200,
-    0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101,
-    0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011,
-    0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010,
-    0x02211110, 0x02211111, 0x02211210, 0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211,
-    0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121,
-    0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 0x02211020, 0x02211222, 0x00202000,
-    0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202,
-    0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202,
-    0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211,
-    0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112,
-    0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020,
-    0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121,
-    0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222,
-    0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102,
-    0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100,
-    0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110,
-    0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011,
-    0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111,
-    0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110,
-    0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121,
-    0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222,
-    0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201,
-    0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102,
-    0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201,
-    0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012,
-    0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010,
-    0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010,
-    0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110,
-    0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011,
-    0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212,
-    0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021,
-    0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021,
-    0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021,
-    0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101,
-    0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101,
-    0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 0x12012100,
-    0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010,
-    0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111,
-    0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010,
-    0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 0x12002211, 0x12012012, 0x12012111,
-    0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120,
-    0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120,
-    0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101,
-    0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001,
-    0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201,
-    0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210,
-    0x10100211, 0x10100212, 0x10110011, 0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211,
-    0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111,
-    0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112,
-    0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211,
-    0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010,
-    0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021,
-    0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122,
-    0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221,
-    0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102,
-    0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100,
-    0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101,
-    0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101,
-    0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101,
-    0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012,
-    0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110,
-    0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112,
-    0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210,
-    0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210,
-    0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210,
-    0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010,
-    0x12111011, 0x12111110, 0x12111111, 0x12111112, 0x12111210, 0x12111211, 0x12121011, 0x12121110,
-    0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122,
-    0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020,
-    0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021,
-    0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022,
-    0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120,
-    0x11121121, 0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222,
-    0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221,
-    0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001,
-    0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 0x11112102,
-    0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201,
-    0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012,
-    0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111,
-    0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012,
-    0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110,
-    0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110,
-    0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121,
-    0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221,
-    0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220,
-    0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222,
-    0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000,
-    0x11210101, 0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201,
-    0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012,
-    0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011,
-    0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212,
-    0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221,
-    0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121,
-    0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202,
-    0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202,
-    0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002,
-    0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101,
-    0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210,
-    0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112,
-    0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011,
-    0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011,
-    0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210,
-    0x10201022, 0x10201221, 0x10211121, 0x10221020, 0x10221122, 0x10221220, 0x10221221, 0x11201020,
-    0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220,
-    0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222,
-    0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222,
-    0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001,
-    0x11222201, 0x12202101, 0x12212001, 0x12212200, 0x12222102, 0x10202011, 0x10202110, 0x10212010,
-    0x10212111, 0x10222011, 0x10222110, 0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111,
-    0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010,
-    0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110,
-    0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221,
-    0x11212020, 0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122,
-    0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202,
-    0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100,
-    0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101,
-    0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112,
-    0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111,
-    0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211,
-    0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222,
-    0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221,
-    0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022,
-    0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101,
-    0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211,
-    0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111,
-    0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111,
-    0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010,
-    0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 0x20021121,
-    0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222,
-    0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000,
-    0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202,
-    0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000,
-    0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202,
-    0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110,
-    0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110,
-    0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222,
-    0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120,
-    0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022,
-    0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 0x20110201, 0x20120101,
-    0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202,
-    0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110,
-    0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110,
-    0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111,
-    0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111,
-    0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 0x20100121, 0x20110021, 0x20110120,
-    0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121,
-    0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001,
-    0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202,
-    0x21111001, 0x21111100, 0x21111101, 0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001,
-    0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200,
-    0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011,
-    0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212,
-    0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012,
-    0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110,
-    0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012,
-    0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111,
-    0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020,
-    0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121,
-    0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222,
-    0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102,
-    0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102,
-    0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101,
-    0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212,
-    0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210,
-    0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111,
-    0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212,
-    0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221,
-    0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121,
-    0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 0x20210101, 0x20220000, 0x20220002,
-    0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000,
-    0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202,
-    0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112,
-    0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111,
-    0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020,
-    0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221,
-    0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022,
-    0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100,
-    0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201,
-    0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112,
-    0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211,
-    0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 0x22211012,
-    0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121,
-    0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020,
-    0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120,
-    0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 0x20222000, 0x20222002, 0x20222200,
-    0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200,
-    0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110,
-    0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011,
-    0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222,
-    0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020,
-    0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222,
-};
-
-static const __device__ uint8_t ksigns_iq2xs[128] = {
-      0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
-    144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
-    160,  33,  34, 163,  36, 165, 166,  39,  40, 169, 170,  43, 172,  45,  46, 175,
-     48, 177, 178,  51, 180,  53,  54, 183, 184,  57,  58, 187,  60, 189, 190,  63,
-    192,  65,  66, 195,  68, 197, 198,  71,  72, 201, 202,  75, 204,  77,  78, 207,
-     80, 209, 210,  83, 212,  85,  86, 215, 216,  89,  90, 219,  92, 221, 222,  95,
-     96, 225, 226,  99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
-    240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
-};
-
-static const __device__ uint64_t ksigns64[128] = {
-    0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
-    0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
-    0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
-    0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff,
-    0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff,
-    0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
-    0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff,
-    0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff,
-    0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
-    0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff,
-    0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff,
-    0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
-    0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff,
-    0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff,
-    0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
-    0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff,
-    0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff,
-    0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
-    0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff,
-    0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff,
-    0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
-    0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff,
-    0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff,
-    0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
-    0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff,
-    0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff,
-    0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
-    0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff,
-    0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff,
-    0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
-    0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
-    0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
-};
-
-static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
-static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
-
-
-typedef half dfloat; // dequantize float
-typedef half2 dfloat2;
-typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-template<typename dst_t>
-using to_cuda_ggml_t = void (*)(const void * __restrict__ x, dst_t * __restrict__ y, int k, cudaStream_t stream);
-typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
-typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
-typedef void (*load_tiles_cuda_t)(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
-typedef float (*vec_dot_q_mul_mat_cuda_t)(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
-
-// Utility function
-
-template<typename dst_t>
-static __device__ __forceinline__ dst_t convert_from_half(half val) {
-    return val;
-}
-
-template<>
-__device__ __forceinline__ c10::BFloat16 convert_from_half<c10::BFloat16>(half val) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
-    return __float2bfloat16(__half2float(val));
-#else
-    return __half2float(val);
-#endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
-}
-
-template<>
-__device__ __forceinline__ float convert_from_half<float>(half val) {
-    return __half2float(val);
-}
-
-#if defined(USE_ROCM)
-
-#ifndef __has_builtin
-    #define __has_builtin(x) 0
-#endif
-
-typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
-static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-#if __has_builtin(__builtin_elementwise_sub_sat)
-    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
-    return reinterpret_cast<const int &>(c);
-#else
-    int8x4_t c;
-    int16_t tmp;
-#pragma unroll
-    for (int i = 0; i < 4; i++) {
-        tmp = va[i] - vb[i];
-        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
-        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
-        c[i] = tmp;
-    }
-    return reinterpret_cast<int &>(c);
-#endif // __has_builtin(__builtin_elementwise_sub_sat)
-}
-
-static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
-#if __has_builtin(__builtin_amdgcn_sdot4)
-    c = __builtin_amdgcn_sdot4(a, b, c, false);
-#else
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
-#endif
-    return c;
-}
-
-static __device__ __forceinline__ uint32_t __vcmpeq4(const uint32_t a, const uint32_t b) {
-    uint32_t neq = a^b;
-    return !(neq & 0xff000000) * 0xff000000 |
-           !(neq & 0x00ff0000) * 0x00ff0000 |
-           !(neq & 0x0000ff00) * 0x0000ff00 |
-           !(neq & 0x000000ff) * 0x000000ff;
-}
-
-static __device__ __forceinline__ uint32_t __vsub4(const uint32_t a, const uint32_t b) {
-    return (static_cast<uint8_t>(((a & 0xff000000) >> 24) - ((b & 0xff000000) >> 24)) << 24) +
-           (static_cast<uint8_t>(((a & 0x00ff0000) >> 16) - ((b & 0x00ff0000) >> 16)) << 16) +
-           (static_cast<uint8_t>(((a & 0x0000ff00) >>  8) - ((b & 0x0000ff00) >>  8)) <<  8) +
-           (static_cast<uint8_t>(((a & 0x000000ff) >>  0) - ((b & 0x000000ff) >>  0)) <<  0);
-}
-#endif // defined(USE_ROCM)
diff --git a/csrc/quantization/gguf/gguf_kernel.cu b/csrc/quantization/gguf/gguf_kernel.cu
deleted file mode 100644
index 76fe73e95040..000000000000
--- a/csrc/quantization/gguf/gguf_kernel.cu
+++ /dev/null
@@ -1,542 +0,0 @@
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
-
-#include <torch/all.h>
-#include <c10/cuda/CUDAGuard.h>
-
-#include "../../cuda_compat.h"
-#include "dispatch_utils.h"
-
-#include "ggml-common.h"
-#include "vecdotq.cuh"
-#include "dequantize.cuh"
-#include "mmvq.cuh"
-#include "mmq.cuh"
-#include "moe.cuh"
-#include "moe_vec.cuh"
-
-// Q8 gemv
-template <typename scalar_t>
-static __global__ void quantize_q8_1(const scalar_t* __restrict__ x,
-                                     void* __restrict__ vy, const int kx,
-                                     const int kx_padded) {
-  const auto ix = blockDim.x * blockIdx.x + threadIdx.x;
-  if (ix >= kx_padded) {
-    return;
-  }
-  const auto iy = blockDim.y * blockIdx.y + threadIdx.y;
-  const int i_padded = iy * kx_padded + ix;
-
-  block_q8_1* y = (block_q8_1*)vy;
-
-  const int ib = i_padded / QK8_1;   // block index
-  const int iqs = i_padded % QK8_1;  // quant index
-
-  const float xi = ix < kx ? static_cast<float>(x[iy * kx + ix]) : 0.0f;
-  float amax = fabsf(xi);
-  float sum = xi;
-
-#pragma unroll
-  for (int mask = 16; mask > 0; mask >>= 1) {
-    amax = fmaxf(amax, VLLM_SHFL_XOR_SYNC_WIDTH(amax, mask, 32));
-    sum += VLLM_SHFL_XOR_SYNC_WIDTH(sum, mask, 32);
-  }
-
-  const float d = amax / 127;
-  const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
-
-  y[ib].qs[iqs] = q;
-
-  if (iqs > 0) {
-    return;
-  }
-
-  y[ib].ds.x = __float2half(d);
-  y[ib].ds.y = __float2half(sum);
-}
-
-template <typename scalar_t>
-static void quantize_row_q8_1_cuda(const scalar_t* x, void* vy, const int kx,
-                                   const int ky, cudaStream_t stream) {
-  const int64_t kx_padded = (kx + 512 - 1) / 512 * 512;
-  const int block_num_x =
-      (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-  constexpr int MAX_BLOCK_SIZE = 65535;
-  for (int off = 0; off < ky; off += MAX_BLOCK_SIZE) {
-    const int num_blocks_y = std::min(ky, off + MAX_BLOCK_SIZE) - off;
-    const dim3 num_blocks(block_num_x, num_blocks_y, 1);
-    const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
-    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(
-        &x[off * kx], (int32_t*)vy + off * (kx_padded / 32 * 9), kx, kx_padded);
-  }
-}
-
-torch::Tensor ggml_dequantize(torch::Tensor W,  // quant weight
-                              int64_t type, int64_t m, int64_t n,
-                              std::optional<at::ScalarType> const& dtype) {
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(W));
-  auto dtype_ = dtype.value_or(torch::kFloat16);
-  auto options = torch::TensorOptions().dtype(dtype_).device(W.device());
-  at::Tensor DW = torch::empty({m, n}, options);
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
-
-  VLLM_DISPATCH_FLOATING_TYPES(DW.scalar_type(), "ggml_dequantize", [&] {
-    auto to_cuda = ggml_get_to_cuda<scalar_t>(type);
-    to_cuda((void*)W.data_ptr(), (scalar_t*)DW.data_ptr(), m * n, stream);
-  });
-
-  return DW;
-}
-
-torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W,  // quant weight
-                                  torch::Tensor X,  // input
-                                  int64_t type, int64_t row) {
-  int col = X.sizes()[1];
-  int vecs = X.sizes()[0];
-  const int padded = (col + 512 - 1) / 512 * 512;
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(X));
-  auto options = torch::TensorOptions().dtype(X.dtype()).device(W.device());
-  at::Tensor Y = torch::empty({vecs, row}, options);
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
-  options = torch::TensorOptions().dtype(torch::kInt32).device(W.device());
-  at::Tensor quant_X = torch::empty({vecs, padded / 32 * 9}, options);
-  VLLM_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_mul_mat_vec_a8", [&] {
-    quantize_row_q8_1_cuda<scalar_t>(
-        (scalar_t*)X.data_ptr(), (void*)quant_X.data_ptr(), col, vecs, stream);
-    switch (type) {
-      case 2:
-        mul_mat_vec_q4_0_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-        break;
-      case 3:
-        mul_mat_vec_q4_1_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-        break;
-      case 6:
-        mul_mat_vec_q5_0_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-        break;
-      case 7:
-        mul_mat_vec_q5_1_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-        break;
-      case 8:
-        mul_mat_vec_q8_0_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-        break;
-      case 10:
-        mul_mat_vec_q2_K_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-        break;
-      case 11:
-        mul_mat_vec_q3_K_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-        break;
-      case 12:
-        mul_mat_vec_q4_K_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-        break;
-      case 13:
-        mul_mat_vec_q5_K_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-        break;
-      case 14:
-        mul_mat_vec_q6_K_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-        break;
-      case 16:
-        mul_mat_vec_iq2_xxs_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-        break;
-      case 17:
-        mul_mat_vec_iq2_xs_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-        break;
-      case 18:
-        mul_mat_vec_iq3_xxs_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-        break;
-      case 19:
-        mul_mat_vec_iq1_s_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-        break;
-      case 20:
-        mul_mat_vec_iq4_nl_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-        break;
-      case 21:
-        mul_mat_vec_iq3_s_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-        break;
-      case 22:
-        mul_mat_vec_iq2_s_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-        break;
-      case 23:
-        mul_mat_vec_iq4_xs_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-        break;
-      case 29:
-        mul_mat_vec_iq1_m_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-        break;
-    }
-  });
-  return Y;
-}
-
-torch::Tensor ggml_mul_mat_a8(torch::Tensor W,  // quant weight
-                              torch::Tensor X,  // input
-                              int64_t type, int64_t row) {
-  int col = X.sizes()[1];
-  int padded = (col + 512 - 1) / 512 * 512;
-  int batch = X.sizes()[0];
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(X));
-  auto options = torch::TensorOptions().dtype(X.dtype()).device(W.device());
-  at::Tensor Y = torch::empty({batch, row}, options);
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
-  options = torch::TensorOptions().dtype(torch::kInt32).device(W.device());
-  at::Tensor quant_X = torch::empty({batch, padded / 32 * 9}, options);
-  VLLM_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_mul_mat_a8", [&] {
-    quantize_row_q8_1_cuda((scalar_t*)X.data_ptr(), (void*)quant_X.data_ptr(),
-                           col, batch, stream);
-
-    switch (type) {
-      case 2:
-        ggml_mul_mat_q4_0_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-      case 3:
-        ggml_mul_mat_q4_1_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-      case 6:
-        ggml_mul_mat_q5_0_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-      case 7:
-        ggml_mul_mat_q5_1_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-      case 8:
-        ggml_mul_mat_q8_0_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-      case 10:
-        ggml_mul_mat_q2_K_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-      case 11:
-        ggml_mul_mat_q3_K_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-      case 12:
-        ggml_mul_mat_q4_K_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-      case 13:
-        ggml_mul_mat_q5_K_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-      case 14:
-        ggml_mul_mat_q6_K_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-    }
-  });
-  return Y;
-}
-
-torch::Tensor ggml_moe_a8(torch::Tensor X,  // input
-                          torch::Tensor W,  // expert weights
-                          torch::Tensor sorted_token_ids,
-                          torch::Tensor expert_ids,
-                          torch::Tensor num_tokens_post_padded, int64_t type,
-                          int64_t row, int64_t top_k, int64_t tokens) {
-  int col = X.sizes()[1];
-  int padded = (col + 512 - 1) / 512 * 512;
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(X));
-  auto options = torch::TensorOptions().dtype(X.dtype()).device(W.device());
-  at::Tensor Y = torch::empty({tokens * top_k, row}, options);
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
-  options = torch::TensorOptions().dtype(torch::kInt32).device(W.device());
-  at::Tensor quant_X = torch::empty({tokens, padded / 32 * 9}, options);
-  VLLM_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_moe_a8", [&] {
-    quantize_row_q8_1_cuda((scalar_t*)X.data_ptr(), (void*)quant_X.data_ptr(),
-                           col, tokens, stream);
-    switch (type) {
-      case 2:
-        ggml_moe_q4_0_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-      case 3:
-        ggml_moe_q4_1_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-      case 6:
-        ggml_moe_q5_0_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-      case 7:
-        ggml_moe_q5_1_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-      case 8:
-        ggml_moe_q8_0_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-      case 10:
-        ggml_moe_q2_K_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-      case 11:
-        ggml_moe_q3_K_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-      case 12:
-        ggml_moe_q4_K_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-      case 13:
-        ggml_moe_q5_K_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-      case 14:
-        ggml_moe_q6_K_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-    }
-  });
-  return Y;
-}
-
-torch::Tensor ggml_moe_a8_vec(torch::Tensor X,  // input
-                              torch::Tensor W,  // expert weights
-                              torch::Tensor topk_ids, int64_t top_k,
-                              int64_t type, int64_t row, int64_t tokens) {
-  int col = X.sizes()[1];
-  const int padded = (col + 512 - 1) / 512 * 512;
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(X));
-  auto options = torch::TensorOptions().dtype(X.dtype()).device(W.device());
-  at::Tensor Y = torch::zeros({tokens * top_k, row}, options);
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
-  options = torch::TensorOptions().dtype(torch::kInt32).device(W.device());
-  at::Tensor quant_X = torch::empty({tokens, padded / 32 * 9}, options);
-  VLLM_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_moe_vec_a8", [&] {
-    quantize_row_q8_1_cuda<scalar_t>((scalar_t*)X.data_ptr(),
-                                     (void*)quant_X.data_ptr(), col, tokens,
-                                     stream);
-    switch (type) {
-      case 2:
-        moe_vec_q4_0_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 3:
-        moe_vec_q4_1_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 6:
-        moe_vec_q5_0_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 7:
-        moe_vec_q5_1_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 8:
-        moe_vec_q8_0_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 10:
-        moe_vec_q2_K_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 11:
-        moe_vec_q3_K_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 12:
-        moe_vec_q4_K_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 13:
-        moe_vec_q5_K_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 14:
-        moe_vec_q6_K_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 16:
-        moe_vec_iq2_xxs_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 17:
-        moe_vec_iq2_xs_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 18:
-        moe_vec_iq3_xxs_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 19:
-        moe_vec_iq1_s_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 20:
-        moe_vec_iq4_nl_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 21:
-        moe_vec_iq3_s_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 22:
-        moe_vec_iq2_s_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 23:
-        moe_vec_iq4_xs_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 29:
-        moe_vec_iq1_m_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-    }
-  });
-  return Y;
-}
-
-int64_t ggml_moe_get_block_size(int64_t type) {
-  switch (type) {
-    case 2:
-      return MOE_X_Q4_0;
-    case 3:
-      return MOE_X_Q4_1;
-    case 6:
-      return MOE_X_Q5_0;
-    case 7:
-      return MOE_X_Q5_1;
-    case 8:
-      return MOE_X_Q8_0;
-    case 10:
-      return MOE_X_Q2_K;
-    case 11:
-      return MOE_X_Q3_K;
-    case 12:
-      return MOE_X_Q4_K;
-    case 13:
-      return MOE_X_Q5_K;
-    case 14:
-      return MOE_X_Q6_K;
-  }
-  return 0;
-}
diff --git a/csrc/quantization/gguf/mmq.cuh b/csrc/quantization/gguf/mmq.cuh
deleted file mode 100644
index 7c89918c23d8..000000000000
--- a/csrc/quantization/gguf/mmq.cuh
+++ /dev/null
@@ -1,610 +0,0 @@
-// copied from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmq.cu
-template <typename scalar_t, int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
-              allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
-static __device__ __forceinline__ void mul_mat_q(
-    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-    const block_q_t  * x = (const block_q_t  *) vx;
-    const block_q8_1 * y = (const block_q8_1 *) vy;
-
-    const int blocks_per_row_x = ncols_x / qk;
-    const int blocks_per_col_y = nrows_y / QK8_1;
-    const int blocks_per_warp = WARP_SIZE_GGUF / qi;
-
-    const int & ncols_dst = ncols_y;
-
-    const auto row_dst_0 = blockIdx.x*mmq_y;
-    const int & row_x_0 = row_dst_0;
-
-    const auto col_dst_0 = blockIdx.y*mmq_x;
-    const int & col_y_0 = col_dst_0;
-
-    int   * tile_x_ql = nullptr;
-    half2 * tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-
-    __shared__ int    tile_y_qs[mmq_x * WARP_SIZE_GGUF];
-    __shared__ half2  tile_y_ds[mmq_x * WARP_SIZE_GGUF/QI8_1];
-
-    float sum[mmq_y/WARP_SIZE_GGUF][mmq_x/nwarps] = {{0.0f}};
-
-    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
-
-        load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
-                   threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
-
-#pragma unroll
-        for (int ir = 0; ir < qr && ib0 + ir * blocks_per_warp/qr < blocks_per_row_x; ++ir) {
-            const auto kqs = ir*WARP_SIZE_GGUF + threadIdx.x;
-            const int kbxd = kqs / QI8_1;
-
-#pragma unroll
-            for (int i = 0; i < mmq_x; i += nwarps) {
-                const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
-                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
-                const int index_y = (threadIdx.y + i) * WARP_SIZE_GGUF + kqs % WARP_SIZE_GGUF;
-                tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
-            }
-
-#pragma unroll
-            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
-                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE_GGUF/QI8_1)) % mmq_x;
-                const auto kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1);
-                const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
-
-                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
-                const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE_GGUF/QI8_1) + kby].ds;
-                half2       * dsi_dst = &tile_y_ds[ids * (WARP_SIZE_GGUF/QI8_1) + kby];
-                if (need_sum) {
-                    *dsi_dst = *dsi_src;
-                } else {
-                    float * dfi_dst = (float *) dsi_dst;
-                    *dfi_dst = __low2float(*dsi_src);
-                }
-            }
-
-            __syncthreads();
-
-// #pragma unroll // unrolling this loop causes too much register pressure
-            for (int k = ir*WARP_SIZE_GGUF/qr; k < (ir+1)*WARP_SIZE_GGUF/qr; k += vdr) {
-#pragma unroll
-                for (int j = 0; j < mmq_x; j += nwarps) {
-#pragma unroll
-                    for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
-                        sum[i/WARP_SIZE_GGUF][j/nwarps] += vec_dot(
-                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
-                            threadIdx.x + i, threadIdx.y + j, k);
-                    }
-                }
-            }
-            __syncthreads();
-        }
-    }
-
-#pragma unroll
-    for (int j = 0; j < mmq_x; j += nwarps) {
-        const auto col_dst = col_dst_0 + j + threadIdx.y;
-        if (col_dst >= ncols_dst) {
-            return;
-        }
-
-#pragma unroll
-        for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
-            const auto row_dst = row_dst_0 + threadIdx.x + i;
-            if (row_dst >= nrows_dst) {
-                continue;
-            }
-            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE_GGUF][j/nwarps];
-        }
-    }
-}
-
-#if defined(USE_ROCM)
-#define  MMQ_X_Q4_0  64
-#define  MMQ_Y_Q4_0  128
-#define NWARPS_Q4_0  8
-#else
-#define  MMQ_X_Q4_0 4
-#define  MMQ_Y_Q4_0 32
-#define NWARPS_Q4_0 4
-#endif
-
-template<typename scalar_t, bool need_check> static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q4_0, 2)
-#endif
-mul_mat_q4_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-    const int mmq_x  =  MMQ_X_Q4_0;
-    const int mmq_y  =  MMQ_Y_Q4_0;
-    const int nwarps = NWARPS_Q4_0;
-
-    mul_mat_q<scalar_t, QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-}
-
-template<typename scalar_t>
-static void ggml_mul_mat_q4_0_q8_1_cuda(
-    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int mmq_x  =  MMQ_X_Q4_0;
-    int mmq_y  =  MMQ_Y_Q4_0;
-    int nwarps = NWARPS_Q4_0;
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q4_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q4_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-#if defined(USE_ROCM)
-#define  MMQ_X_Q4_1 64
-#define  MMQ_Y_Q4_1 128
-#define NWARPS_Q4_1 8
-#else
-#define  MMQ_X_Q4_1 4
-#define  MMQ_Y_Q4_1 32
-#define NWARPS_Q4_1 4
-#endif
-
-template<typename scalar_t, bool need_check> static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q4_1, 2)
-#endif
-mul_mat_q4_1(
-    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-    const int mmq_x  =  MMQ_X_Q4_1;
-    const int mmq_y  =  MMQ_Y_Q4_1;
-    const int nwarps = NWARPS_Q4_1;
-
-    mul_mat_q<scalar_t, QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-}
-
-template<typename scalar_t>
-static void ggml_mul_mat_q4_1_q8_1_cuda(
-    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int mmq_x  =  MMQ_X_Q4_1;
-    int mmq_y  =  MMQ_Y_Q4_1;
-    int nwarps = NWARPS_Q4_1;
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q4_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q4_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-#if defined(USE_ROCM)
-#define  MMQ_X_Q5_0 64
-#define  MMQ_Y_Q5_0 128
-#define NWARPS_Q5_0 8
-#else
-#define  MMQ_X_Q5_0 4
-#define  MMQ_Y_Q5_0 32
-#define NWARPS_Q5_0 4
-#endif
-
-template<typename scalar_t, bool need_check> static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q5_0, 2)
-#endif
-mul_mat_q5_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-    const int mmq_x  =  MMQ_X_Q5_0;
-    const int mmq_y  =  MMQ_Y_Q5_0;
-    const int nwarps = NWARPS_Q5_0;
-
-    mul_mat_q<scalar_t, QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-}
-
-template<typename scalar_t>
-static void ggml_mul_mat_q5_0_q8_1_cuda(
-    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    const int mmq_x  =  MMQ_X_Q5_0;
-    const int mmq_y  =  MMQ_Y_Q5_0;
-    const int nwarps = NWARPS_Q5_0;
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q5_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q5_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-#if defined(USE_ROCM)
-#define  MMQ_X_Q5_1 64
-#define  MMQ_Y_Q5_1 128
-#define NWARPS_Q5_1 8
-#else
-#define  MMQ_X_Q5_1 4
-#define  MMQ_Y_Q5_1 32
-#define NWARPS_Q5_1 4
-#endif
-
-template<typename scalar_t, bool need_check> static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q5_1, 2)
-#endif
-mul_mat_q5_1(
-    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-    const int mmq_x  =  MMQ_X_Q5_1;
-    const int mmq_y  =  MMQ_Y_Q5_1;
-    const int nwarps = NWARPS_Q5_1;
-
-    mul_mat_q<scalar_t, QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-}
-
-template<typename scalar_t>
-static void ggml_mul_mat_q5_1_q8_1_cuda(
-    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-    const int mmq_x  =  MMQ_X_Q5_1;
-    const int mmq_y  =  MMQ_Y_Q5_1;
-    const int nwarps = NWARPS_Q5_1;
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q5_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q5_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-#if defined(USE_ROCM)
-#define  MMQ_X_Q8_0 64
-#define  MMQ_Y_Q8_0 128
-#define NWARPS_Q8_0 8
-#else
-#define  MMQ_X_Q8_0 4
-#define  MMQ_Y_Q8_0 32
-#define NWARPS_Q8_0 4
-#endif
-
-template<typename scalar_t, bool need_check> static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q8_0, 2)
-#endif
-mul_mat_q8_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-    const int mmq_x  =  MMQ_X_Q8_0;
-    const int mmq_y  =  MMQ_Y_Q8_0;
-    const int nwarps = NWARPS_Q8_0;
-
-    mul_mat_q<scalar_t, QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-}
-
-template<typename scalar_t>
-static void ggml_mul_mat_q8_0_q8_1_cuda(
-    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-    const int mmq_x  =  MMQ_X_Q8_0;
-    const int mmq_y  =  MMQ_Y_Q8_0;
-    const int nwarps = NWARPS_Q8_0;
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q8_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q8_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-#if defined(USE_ROCM)
-#define  MMQ_X_Q2_K 64
-#define  MMQ_Y_Q2_K 128
-#define NWARPS_Q2_K 8
-#else
-#define  MMQ_X_Q2_K 4
-#define  MMQ_Y_Q2_K 32
-#define NWARPS_Q2_K 4
-#endif
-
-template<typename scalar_t, bool need_check> static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q2_K, 2)
-#endif
-mul_mat_q2_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-    const int mmq_x  =  MMQ_X_Q2_K;
-    const int mmq_y  =  MMQ_Y_Q2_K;
-    const int nwarps = NWARPS_Q2_K;
-
-    mul_mat_q<scalar_t, QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-}
-
-template<typename scalar_t>
-static void ggml_mul_mat_q2_K_q8_1_cuda(
-    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-    const int mmq_x  =  MMQ_X_Q2_K;
-    const int mmq_y  =  MMQ_Y_Q2_K;
-    const int nwarps = NWARPS_Q2_K;
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q2_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q2_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-#if defined(USE_ROCM)
-#define  MMQ_X_Q3_K 64
-#define  MMQ_Y_Q3_K 128
-#define NWARPS_Q3_K 8
-#else
-#define  MMQ_X_Q3_K 4
-#define  MMQ_Y_Q3_K 32
-#define NWARPS_Q3_K 4
-#endif
-
-template<typename scalar_t, bool need_check> static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q3_K, 2)
-#endif
-mul_mat_q3_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-    const int mmq_x  =  MMQ_X_Q3_K;
-    const int mmq_y  =  MMQ_Y_Q3_K;
-    const int nwarps = NWARPS_Q3_K;
-
-    mul_mat_q<scalar_t, QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-}
-
-template<typename scalar_t>
-static void ggml_mul_mat_q3_K_q8_1_cuda(
-    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    const int mmq_x  =  MMQ_X_Q3_K;
-    const int mmq_y  =  MMQ_Y_Q3_K;
-    const int nwarps = NWARPS_Q3_K;
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q3_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q3_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-#if defined(USE_ROCM)
-#define  MMQ_X_Q4_K 64
-#define  MMQ_Y_Q4_K 128
-#define NWARPS_Q4_K 8
-#else
-#define  MMQ_X_Q4_K 4
-#define  MMQ_Y_Q4_K 32
-#define NWARPS_Q4_K 4
-#endif
-
-template<typename scalar_t, bool need_check> static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q4_K, 2)
-#endif
-mul_mat_q4_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-    const int mmq_x  =  MMQ_X_Q4_K;
-    const int mmq_y  =  MMQ_Y_Q4_K;
-    const int nwarps = NWARPS_Q4_K;
-
-    mul_mat_q<scalar_t, QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-}
-
-template<typename scalar_t>
-static void ggml_mul_mat_q4_K_q8_1_cuda(
-    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-    const int mmq_x  =  MMQ_X_Q4_K;
-    const int mmq_y  =  MMQ_Y_Q4_K;
-    const int nwarps = NWARPS_Q4_K;
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q4_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q4_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-#if defined(USE_ROCM)
-#define  MMQ_X_Q5_K 64
-#define  MMQ_Y_Q5_K 128
-#define NWARPS_Q5_K 8
-#else
-#define  MMQ_X_Q5_K 4
-#define  MMQ_Y_Q5_K 32
-#define NWARPS_Q5_K 4
-#endif
-
-template<typename scalar_t, bool need_check> static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q5_K, 2)
-#endif
-mul_mat_q5_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-    const int mmq_x  =  MMQ_X_Q5_K;
-    const int mmq_y  =  MMQ_Y_Q5_K;
-    const int nwarps = NWARPS_Q5_K;
-
-    mul_mat_q<scalar_t, QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-}
-
-template<typename scalar_t>
-static void ggml_mul_mat_q5_K_q8_1_cuda(
-    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    const int mmq_x  =  MMQ_X_Q5_K;
-    const int mmq_y  =  MMQ_Y_Q5_K;
-    const int nwarps = NWARPS_Q5_K;
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q5_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q5_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-#if defined(USE_ROCM)
-#define  MMQ_X_Q6_K 64
-#define  MMQ_Y_Q6_K 128
-#define NWARPS_Q6_K 8
-#else
-#define  MMQ_X_Q6_K 4
-#define  MMQ_Y_Q6_K 32
-#define NWARPS_Q6_K 4
-#endif
-
-template<typename scalar_t, bool need_check> static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q6_K, 2)
-#endif
-mul_mat_q6_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-    const int mmq_x  =  MMQ_X_Q6_K;
-    const int mmq_y  =  MMQ_Y_Q6_K;
-    const int nwarps = NWARPS_Q6_K;
-
-    mul_mat_q<scalar_t, QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-}
-
-template<typename scalar_t>
-static void ggml_mul_mat_q6_K_q8_1_cuda(
-    const void * vx, const void * vy, scalar_t * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-    const int mmq_x  =  MMQ_X_Q6_K;
-    const int mmq_y  =  MMQ_Y_Q6_K;
-    const int nwarps = NWARPS_Q6_K;
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q6_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q6_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
diff --git a/csrc/quantization/gguf/mmvq.cuh b/csrc/quantization/gguf/mmvq.cuh
deleted file mode 100644
index e27bec7af5b7..000000000000
--- a/csrc/quantization/gguf/mmvq.cuh
+++ /dev/null
@@ -1,212 +0,0 @@
-// copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmvq.cu
-template <typename scalar_t, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
-static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, const int ncols, const int nrows, const int nvecs) {
-    const auto row = blockIdx.x*blockDim.y + threadIdx.y;
-    const auto vec = blockIdx.y;
-
-    if (row >= nrows || vec >= nvecs) {
-        return;
-    }
-
-    const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-    const int nrows_y = (ncols + 512 - 1) / 512 * 512;
-
-
-    // partial sum for each thread
-    float tmp = 0.0f;
-
-    const block_q_t  * x = (const block_q_t  *) vx;
-    const block_q8_1 * y = (const block_q8_1 *) vy;
-
-    for (auto i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
-        const int ibx = row*blocks_per_row + i; // x block index
-
-        const int iby = vec*(nrows_y/QK8_1) + i * (qk/QK8_1); // y block index that aligns with ibx
-
-        const int iqs  = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
-
-        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
-    }
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) {
-        tmp += VLLM_SHFL_XOR_SYNC(tmp, mask);
-    }
-
-    if (threadIdx.x == 0) {
-        dst[vec*nrows + row] = tmp;
-    }
-}
-
-template<typename scalar_t>
-static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, nvecs, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<scalar_t, QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
-}
-
-template<typename scalar_t>
-static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, nvecs, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<scalar_t, QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
-}
-
-template<typename scalar_t>
-static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, nvecs, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<scalar_t, QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
-}
-
-template<typename scalar_t>
-static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, nvecs, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<scalar_t, QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
-}
-
-template<typename scalar_t>
-static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, nvecs, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<scalar_t, QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
-}
-
-template<typename scalar_t>
-static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, nvecs, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<scalar_t, QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
-}
-
-template<typename scalar_t>
-static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, nvecs, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<scalar_t, QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
-}
-
-template<typename scalar_t>
-static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, nvecs, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<scalar_t, QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
-}
-
-template<typename scalar_t>
-static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, nvecs, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<scalar_t, QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
-}
-
-template<typename scalar_t>
-static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, nvecs, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<scalar_t, QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
-}
-
-template<typename scalar_t>
-static void mul_mat_vec_iq2_xxs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, nvecs, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<scalar_t, QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
-}
-
-template<typename scalar_t>
-static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, nvecs, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<scalar_t, QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
-}
-
-template<typename scalar_t>
-static void mul_mat_vec_iq2_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, nvecs, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<scalar_t, QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
-}
-
-template<typename scalar_t>
-static void mul_mat_vec_iq3_xxs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, nvecs, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<scalar_t, QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
-}
-
-template<typename scalar_t>
-static void mul_mat_vec_iq1_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, nvecs, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<scalar_t, QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
-}
-
-template<typename scalar_t>
-static void mul_mat_vec_iq1_m_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, nvecs, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<scalar_t, QK_K, QI1_M, block_iq1_m, 1, vec_dot_iq1_m_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
-}
-
-template<typename scalar_t>
-static void mul_mat_vec_iq4_nl_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, nvecs, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<scalar_t, QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
-}
-
-template<typename scalar_t>
-static void mul_mat_vec_iq4_xs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, nvecs, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<scalar_t, QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
-}
-
-template<typename scalar_t>
-static void mul_mat_vec_iq3_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, nvecs, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<scalar_t, QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
-}
diff --git a/csrc/quantization/gguf/moe.cuh b/csrc/quantization/gguf/moe.cuh
deleted file mode 100644
index df9b84abcc13..000000000000
--- a/csrc/quantization/gguf/moe.cuh
+++ /dev/null
@@ -1,739 +0,0 @@
-#include <cstdint>
-
-/* Adapted from ./csrc/quantization/gguf/mmq.cuh
-   based on ./vllm/model_executor/layers/fused_moe/fused_moe.py */
-template <typename scalar_t, int qk, int qr, int qi, bool need_sum,
-          typename block_q_t, int mmq_x, int mmq_y, int nwarps,
-          allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles,
-          int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
-static __device__ __forceinline__ void moe_q(
-    const void* __restrict__ vx, const void* __restrict__ vy,
-    scalar_t* __restrict__ dst, const int* __restrict__ sorted_token_ids,
-    const int* __restrict__ expert_ids,
-    const int* __restrict__ num_tokens_post_padded, const int exp_stride,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y,
-    const int nrows_dst, const int top_k) {
-  const int blocks_per_row_x = ncols_x / qk;
-  const int blocks_per_col_y = nrows_y / QK8_1;
-  const int blocks_per_warp = WARP_SIZE_GGUF / qi;
-
-  const int ncols_dst = ncols_y * top_k;
-
-  const auto row_dst_0 = blockIdx.x * mmq_y;
-  const int& row_x_0 = row_dst_0;
-
-  const auto col_dst_0 = blockIdx.y * mmq_x;
-
-  int token_offs[mmq_x / nwarps];
-  for (int i = 0; i < mmq_x; i += nwarps) {
-    token_offs[i / nwarps] = sorted_token_ids[col_dst_0 + threadIdx.y + i];
-  }
-
-  const int exp_idx = expert_ids[blockIdx.y];
-  if (exp_idx > 255 || exp_idx < 0) return;
-  if (blockIdx.y * mmq_x > num_tokens_post_padded[0]) return;
-
-  const block_q_t* x = (const block_q_t*)((char*)vx + exp_idx * exp_stride);
-  const block_q8_1* y = (const block_q8_1*)(vy);
-
-  int* tile_x_ql = nullptr;
-  half2* tile_x_dm = nullptr;
-  int* tile_x_qh = nullptr;
-  int* tile_x_sc = nullptr;
-
-  allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-
-  __shared__ int tile_y_qs[mmq_x * WARP_SIZE_GGUF];
-  __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE_GGUF / QI8_1];
-
-  float sum[mmq_y / WARP_SIZE_GGUF][mmq_x / nwarps] = {{0.0f}};
-
-  for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
-    load_tiles(x + row_x_0 * blocks_per_row_x + ib0, tile_x_ql, tile_x_dm,
-               tile_x_qh, tile_x_sc, threadIdx.y, nrows_x - row_x_0 - 1,
-               threadIdx.x, blocks_per_row_x);
-
-    const int n_per_r = ((qk * blocks_per_warp) / qr);
-#pragma unroll
-    for (int ir = 0; ir < qr && ib0 * qk + ir * n_per_r < ncols_x; ++ir) {
-      const auto kqs = ir * WARP_SIZE_GGUF + threadIdx.x;
-      const int kbxd = kqs / QI8_1;
-
-#pragma unroll
-      for (int i = 0; i < mmq_x; i += nwarps) {
-        const int col_y_eff = token_offs[i / nwarps] / top_k;
-        const int block_x = ib0 * (qk / QK8_1) + kbxd;
-        if (col_y_eff < ncols_y && block_x < blocks_per_col_y) {
-          const block_q8_1* by0 = &y[col_y_eff * blocks_per_col_y + block_x];
-          const int index_y =
-              (threadIdx.y + i) * WARP_SIZE_GGUF + kqs % WARP_SIZE_GGUF;
-          tile_y_qs[index_y] =
-              get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
-        }
-      }
-
-      if (threadIdx.x < n_per_r / QK8_1) {
-        const auto kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1);
-        const int col_y_eff = token_offs[threadIdx.y] / top_k;
-        const int block_x =
-            ib0 * (qk / QK8_1) + ir * (WARP_SIZE_GGUF / QI8_1) + kby;
-
-        if (col_y_eff < ncols_y && block_x < blocks_per_col_y) {
-          const half2* dsi_src = &y[col_y_eff * blocks_per_col_y + block_x].ds;
-          half2* dsi_dst =
-              &tile_y_ds[threadIdx.y * (WARP_SIZE_GGUF / QI8_1) + kby];
-
-          if (need_sum) {
-            *dsi_dst = *dsi_src;
-          } else {
-            float* dfi_dst = (float*)dsi_dst;
-            *dfi_dst = __low2float(*dsi_src);
-          }
-        }
-      }
-      __syncthreads();
-
-      // #pragma unroll // unrolling this loop causes too much register pressure
-      for (int k = ir * WARP_SIZE_GGUF / qr; k < (ir + 1) * WARP_SIZE_GGUF / qr;
-           k += vdr) {
-#pragma unroll
-        for (int j = 0; j < mmq_x; j += nwarps) {
-#pragma unroll
-          for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
-            sum[i / WARP_SIZE_GGUF][j / nwarps] +=
-                vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs,
-                        tile_y_ds, threadIdx.x + i, threadIdx.y + j, k);
-          }
-        }
-      }
-      __syncthreads();
-    }
-  }
-
-#pragma unroll
-  for (int j = 0; j < mmq_x; j += nwarps) {
-    const int col_dst = token_offs[j / nwarps];
-    if (col_dst >= ncols_dst) {
-      return;
-    }
-
-#pragma unroll
-    for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
-      const auto row_dst = row_dst_0 + threadIdx.x + i;
-      if (row_dst >= nrows_dst) {
-        continue;
-      }
-      dst[col_dst * nrows_dst + row_dst] = sum[i / WARP_SIZE_GGUF][j / nwarps];
-    }
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q4_0 8
-  #define MOE_Y_Q4_0 128
-  #define NWARPS_Q4_0 8
-#else
-  #define MOE_X_Q4_0 4
-  #define MOE_Y_Q4_0 32
-  #define NWARPS_Q4_0 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_0, 2)
-#endif
-    moe_q4_0(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-  const int mmq_x = MOE_X_Q4_0;
-  const int mmq_y = MOE_Y_Q4_0;
-  const int nwarps = NWARPS_Q4_0;
-
-  moe_q<scalar_t, QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q4_0<mmq_y>, load_tiles_q4_0<mmq_y, nwarps, need_check>,
-        VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-
-template <typename scalar_t>
-static void ggml_moe_q4_0_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  int mmq_x = MOE_X_Q4_0;
-  int mmq_y = MOE_Y_Q4_0;
-  int nwarps = NWARPS_Q4_0;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q4_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q4_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q4_1 8
-  #define MOE_Y_Q4_1 128
-  #define NWARPS_Q4_1 8
-#else
-  #define MOE_X_Q4_1 4
-  #define MOE_Y_Q4_1 32
-  #define NWARPS_Q4_1 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_1, 2)
-#endif
-    moe_q4_1(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-  const int mmq_x = MOE_X_Q4_1;
-  const int mmq_y = MOE_Y_Q4_1;
-  const int nwarps = NWARPS_Q4_1;
-
-  moe_q<scalar_t, QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q4_1<mmq_y>, load_tiles_q4_1<mmq_y, nwarps, need_check>,
-        VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-
-template <typename scalar_t>
-static void ggml_moe_q4_1_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  int mmq_x = MOE_X_Q4_1;
-  int mmq_y = MOE_Y_Q4_1;
-  int nwarps = NWARPS_Q4_1;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q4_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q4_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q5_0 8
-  #define MOE_Y_Q5_0 128
-  #define NWARPS_Q5_0 8
-#else
-  #define MOE_X_Q5_0 4
-  #define MOE_Y_Q5_0 32
-  #define NWARPS_Q5_0 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_0, 2)
-#endif
-    moe_q5_0(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-  const int mmq_x = MOE_X_Q5_0;
-  const int mmq_y = MOE_Y_Q5_0;
-  const int nwarps = NWARPS_Q5_0;
-
-  moe_q<scalar_t, QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q5_0<mmq_y>, load_tiles_q5_0<mmq_y, nwarps, need_check>,
-        VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-
-template <typename scalar_t>
-static void ggml_moe_q5_0_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MOE_X_Q5_0;
-  const int mmq_y = MOE_Y_Q5_0;
-  const int nwarps = NWARPS_Q5_0;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q5_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q5_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q5_1 8
-  #define MOE_Y_Q5_1 128
-  #define NWARPS_Q5_1 8
-#else
-  #define MOE_X_Q5_1 4
-  #define MOE_Y_Q5_1 32
-  #define NWARPS_Q5_1 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_1, 2)
-#endif
-    moe_q5_1(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-  const int mmq_x = MOE_X_Q5_1;
-  const int mmq_y = MOE_Y_Q5_1;
-  const int nwarps = NWARPS_Q5_1;
-
-  moe_q<scalar_t, QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q5_1<mmq_y>, load_tiles_q5_1<mmq_y, nwarps, need_check>,
-        VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-
-template <typename scalar_t>
-static void ggml_moe_q5_1_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MOE_X_Q5_1;
-  const int mmq_y = MOE_Y_Q5_1;
-  const int nwarps = NWARPS_Q5_1;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q5_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q5_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q8_0 8
-  #define MOE_Y_Q8_0 128
-  #define NWARPS_Q8_0 8
-#else
-  #define MOE_X_Q8_0 4
-  #define MOE_Y_Q8_0 32
-  #define NWARPS_Q8_0 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q8_0, 2)
-#endif
-    moe_q8_0(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-  const int mmq_x = MOE_X_Q8_0;
-  const int mmq_y = MOE_Y_Q8_0;
-  const int nwarps = NWARPS_Q8_0;
-
-  moe_q<scalar_t, QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q8_0<mmq_y>, load_tiles_q8_0<mmq_y, nwarps, need_check>,
-        VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-
-template <typename scalar_t>
-static void ggml_moe_q8_0_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MOE_X_Q8_0;
-  const int mmq_y = MOE_Y_Q8_0;
-  const int nwarps = NWARPS_Q8_0;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q8_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q8_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q2_K 8
-  #define MOE_Y_Q2_K 128
-  #define NWARPS_Q2_K 8
-#else
-  #define MOE_X_Q2_K 4
-  #define MOE_Y_Q2_K 32
-  #define NWARPS_Q2_K 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q2_K, 2)
-#endif
-    moe_q2_K(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-  const int mmq_x = MOE_X_Q2_K;
-  const int mmq_y = MOE_Y_Q2_K;
-  const int nwarps = NWARPS_Q2_K;
-
-  moe_q<scalar_t, QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q2_K<mmq_y>, load_tiles_q2_K<mmq_y, nwarps, need_check>,
-        VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-
-template <typename scalar_t>
-static void ggml_moe_q2_K_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MOE_X_Q2_K;
-  const int mmq_y = MOE_Y_Q2_K;
-  const int nwarps = NWARPS_Q2_K;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q2_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q2_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q3_K 8
-  #define MOE_Y_Q3_K 128
-  #define NWARPS_Q3_K 8
-#else
-  #define MOE_X_Q3_K 4
-  #define MOE_Y_Q3_K 32
-  #define NWARPS_Q3_K 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q3_K, 2)
-#endif
-    moe_q3_K(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-
-  const int mmq_x = MOE_X_Q3_K;
-  const int mmq_y = MOE_Y_Q3_K;
-  const int nwarps = NWARPS_Q3_K;
-
-  moe_q<scalar_t, QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q3_K<mmq_y>, load_tiles_q3_K<mmq_y, nwarps, need_check>,
-        VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-template <typename scalar_t>
-static void ggml_moe_q3_K_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MOE_X_Q3_K;
-  const int mmq_y = MOE_Y_Q3_K;
-  const int nwarps = NWARPS_Q3_K;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q3_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q3_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q4_K 8
-  #define MOE_Y_Q4_K 128
-  #define NWARPS_Q4_K 8
-#else
-  #define MOE_X_Q4_K 4
-  #define MOE_Y_Q4_K 32
-  #define NWARPS_Q4_K 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_K, 2)
-#endif
-    moe_q4_K(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-  const int mmq_x = MOE_X_Q4_K;
-  const int mmq_y = MOE_Y_Q4_K;
-  const int nwarps = NWARPS_Q4_K;
-
-  moe_q<scalar_t, QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q4_K<mmq_y>, load_tiles_q4_K<mmq_y, nwarps, need_check>,
-        VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-
-template <typename scalar_t>
-static void ggml_moe_q4_K_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MOE_X_Q4_K;
-  const int mmq_y = MOE_Y_Q4_K;
-  const int nwarps = NWARPS_Q4_K;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q4_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q4_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q5_K 8
-  #define MOE_Y_Q5_K 128
-  #define NWARPS_Q5_K 8
-#else
-  #define MOE_X_Q5_K 4
-  #define MOE_Y_Q5_K 32
-  #define NWARPS_Q5_K 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_K, 2)
-#endif
-    moe_q5_K(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-  const int mmq_x = MOE_X_Q5_K;
-  const int mmq_y = MOE_Y_Q5_K;
-  const int nwarps = NWARPS_Q5_K;
-
-  moe_q<scalar_t, QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q5_K<mmq_y>, load_tiles_q5_K<mmq_y, nwarps, need_check>,
-        VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-
-template <typename scalar_t>
-static void ggml_moe_q5_K_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MOE_X_Q5_K;
-  const int mmq_y = MOE_Y_Q5_K;
-  const int nwarps = NWARPS_Q5_K;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q5_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q5_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q6_K 8
-  #define MOE_Y_Q6_K 128
-  #define NWARPS_Q6_K 8
-#else
-  #define MOE_X_Q6_K 4
-  #define MOE_Y_Q6_K 32
-  #define NWARPS_Q6_K 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q6_K, 2)
-#endif
-    moe_q6_K(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-  const int mmq_x = MOE_X_Q6_K;
-  const int mmq_y = MOE_Y_Q6_K;
-  const int nwarps = NWARPS_Q6_K;
-
-  moe_q<scalar_t, QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q6_K<mmq_y>, load_tiles_q6_K<mmq_y, nwarps, need_check>,
-        VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-
-template <typename scalar_t>
-static void ggml_moe_q6_K_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MOE_X_Q6_K;
-  const int mmq_y = MOE_Y_Q6_K;
-  const int nwarps = NWARPS_Q6_K;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q6_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q6_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
diff --git a/csrc/quantization/gguf/moe_vec.cuh b/csrc/quantization/gguf/moe_vec.cuh
deleted file mode 100644
index 60f65a1bfdcb..000000000000
--- a/csrc/quantization/gguf/moe_vec.cuh
+++ /dev/null
@@ -1,338 +0,0 @@
-// copied and adapted from
-// https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmvq.cu
-template <typename scalar_t, int qk, int qi, typename block_q_t, int vdr,
-          vec_dot_q_cuda_t vec_dot_q_cuda>
-static __global__ void moe_vec_q(const void* __restrict__ vx,
-                                 const void* __restrict__ vy,
-                                 scalar_t* __restrict__ dst,
-                                 const int* topk_ids, const int topk,
-                                 const int ncols, const int nrows,
-                                 const int token_stride) {
-  const auto row = blockIdx.x * blockDim.y + threadIdx.y;
-
-  const auto token = blockIdx.z / topk;
-  const auto expert = (topk_ids)[blockIdx.z];
-
-  if (row >= nrows) {
-    return;
-  }
-
-  const int blocks_per_row = ncols / qk;
-  const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
-  // partial sum for each thread
-  float tmp = 0.0f;
-
-  const block_q_t* x = ((const block_q_t*)vx) + expert * nrows * blocks_per_row;
-  const block_q8_1* y =
-      (const block_q8_1*)(((const int*)vy) + token * token_stride);
-
-  for (auto i = threadIdx.x / (qi / vdr); i < blocks_per_row;
-       i += blocks_per_warp) {
-    const int ibx = row * blocks_per_row + i;  // x block index
-
-    const int iby = i * (qk / QK8_1);  // y block index that aligns with ibx
-
-    const int iqs =
-        vdr *
-        (threadIdx.x %
-         (qi / vdr));  // x block quant index when casting the quants to int
-
-    tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
-  }
-
-  // sum up partial sums and write back result
-#pragma unroll
-  for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
-    tmp += VLLM_SHFL_XOR_SYNC(tmp, mask);
-  }
-
-  if (threadIdx.x == 0) {
-    dst[blockIdx.z * nrows + row] = tmp;
-  }
-}
-
-template <typename scalar_t>
-static void moe_vec_q4_0_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
-            vec_dot_q4_0_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_q4_1_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
-            vec_dot_q4_1_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_q5_0_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
-            vec_dot_q5_0_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_q5_1_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
-            vec_dot_q5_1_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_q8_0_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
-            vec_dot_q8_0_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_q2_K_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
-            vec_dot_q2_K_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_q3_K_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
-            vec_dot_q3_K_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_q4_K_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
-            vec_dot_q4_K_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_q5_K_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
-            vec_dot_q5_K_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_q6_K_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
-            vec_dot_q6_K_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_iq2_xxs_q8_1_cuda(const void* vx, const void* vy,
-                                      scalar_t* dst, const int* topk_ids,
-                                      const int top_k, const int tokens,
-                                      const int ncols, const int nrows,
-                                      const int token_stride,
-                                      cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
-      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
-                                              ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_iq2_xs_q8_1_cuda(const void* vx, const void* vy,
-                                     scalar_t* dst, const int* topk_ids,
-                                     const int top_k, const int tokens,
-                                     const int ncols, const int nrows,
-                                     const int token_stride,
-                                     cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
-      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
-                                              ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_iq2_s_q8_1_cuda(const void* vx, const void* vy,
-                                    scalar_t* dst, const int* topk_ids,
-                                    const int top_k, const int tokens,
-                                    const int ncols, const int nrows,
-                                    const int token_stride,
-                                    cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
-      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
-                                              ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_iq3_xxs_q8_1_cuda(const void* vx, const void* vy,
-                                      scalar_t* dst, const int* topk_ids,
-                                      const int top_k, const int tokens,
-                                      const int ncols, const int nrows,
-                                      const int token_stride,
-                                      cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
-      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
-                                              ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_iq1_s_q8_1_cuda(const void* vx, const void* vy,
-                                    scalar_t* dst, const int* topk_ids,
-                                    const int top_k, const int tokens,
-                                    const int ncols, const int nrows,
-                                    const int token_stride,
-                                    cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
-      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
-                                              ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_iq1_m_q8_1_cuda(const void* vx, const void* vy,
-                                    scalar_t* dst, const int* topk_ids,
-                                    const int top_k, const int tokens,
-                                    const int ncols, const int nrows,
-                                    const int token_stride,
-                                    cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI1_M, block_iq1_m, 1, vec_dot_iq1_m_q8_1>
-      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
-                                              ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_iq4_nl_q8_1_cuda(const void* vx, const void* vy,
-                                     scalar_t* dst, const int* topk_ids,
-                                     const int top_k, const int tokens,
-                                     const int ncols, const int nrows,
-                                     const int token_stride,
-                                     cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ,
-            vec_dot_iq4_nl_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_iq4_xs_q8_1_cuda(const void* vx, const void* vy,
-                                     scalar_t* dst, const int* topk_ids,
-                                     const int top_k, const int tokens,
-                                     const int ncols, const int nrows,
-                                     const int token_stride,
-                                     cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
-      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
-                                              ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_iq3_s_q8_1_cuda(const void* vx, const void* vy,
-                                    scalar_t* dst, const int* topk_ids,
-                                    const int top_k, const int tokens,
-                                    const int ncols, const int nrows,
-                                    const int token_stride,
-                                    cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
-      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
-                                              ncols, nrows, token_stride);
-}
diff --git a/csrc/quantization/gguf/vecdotq.cuh b/csrc/quantization/gguf/vecdotq.cuh
deleted file mode 100644
index d0d4c74ed379..000000000000
--- a/csrc/quantization/gguf/vecdotq.cuh
+++ /dev/null
@@ -1,1812 +0,0 @@
-// copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/vecdotq.cuh
-// and https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmq.cu
-static __device__ __forceinline__ int get_int_b2(const void * x, const int & i32) {
-    const uint16_t * x16 = (const uint16_t *) x; // assume at least 2 byte alignment
-
-    int x32  = x16[2*i32 + 0] <<  0;
-    x32     |= x16[2*i32 + 1] << 16;
-
-    return x32;
-}
-
-static __device__ __forceinline__ int get_int_b4(const void * x, const int & i32) {
-    return ((const int *) x)[i32]; // assume at least 4 byte alignment
-}
-
-static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
-    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
-    int x32 = 0;
-    x32 |= x16[0] <<  0;
-    x32 |= x16[1] << 16;
-    return x32;
-}
-
-static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
-    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
-    int x32 = 0;
-    x32 |= x16[0] <<  0;
-    x32 |= x16[1] << 16;
-    return x32;
-}
-
-static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
-    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
-}
-
-static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
-    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
-}
-
-// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
-// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
-
-#define VDR_Q4_0_Q8_1_MMVQ 2
-#define VDR_Q4_0_Q8_1_MMQ  4
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
-    const int * v, const int * u, const float & d4, const half2 & ds8) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
-        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
-
-        // SIMD dot product of quantized values
-        sumi = __dp4a(vi0, u[2*i+0], sumi);
-        sumi = __dp4a(vi1, u[2*i+1], sumi);
-    }
-
-    const float2 ds8f = __half22float2(ds8);
-
-    // second part effectively subtracts 8 from each quant value
-    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
-#endif
-}
-
-#define VDR_Q4_1_Q8_1_MMVQ 2
-#define VDR_Q4_1_Q8_1_MMQ  4
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
-    const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
-        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
-
-        // SIMD dot product of quantized values
-        sumi = __dp4a(vi0, u[2*i+0], sumi);
-        sumi = __dp4a(vi1, u[2*i+1], sumi);
-    }
-
-    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
-    const float d4d8 = tmp.x;
-    const float m4s8 = tmp.y;
-
-    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
-    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
-#endif
-}
-
-#define VDR_Q5_0_Q8_1_MMVQ 2
-#define VDR_Q5_0_Q8_1_MMQ  4
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
-    const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
-        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
-        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
-        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
-        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
-        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
-
-        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
-        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
-        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
-        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
-        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
-        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
-    }
-
-    const float2 ds8f = __half22float2(ds8);
-
-    // second part effectively subtracts 16 from each quant value
-    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
-#endif
-}
-
-
-#define VDR_Q5_1_Q8_1_MMVQ 2
-#define VDR_Q5_1_Q8_1_MMQ  4
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
-    const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
-        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
-        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
-        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
-        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
-        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
-
-        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
-        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
-        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
-        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
-        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
-        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
-    }
-
-    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
-    const float d5d8 = tmp.x;
-    const float m5s8 = tmp.y;
-
-    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
-    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
-#endif
-}
-
-#define VDR_Q8_0_Q8_1_MMVQ 2
-#define VDR_Q8_0_Q8_1_MMQ 8
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
-    const int * v, const int * u, const float & d8_0, const float & d8_1) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        // SIMD dot product of quantized values
-        sumi = __dp4a(v[i], u[i], sumi);
-    }
-    return d8_0*d8_1 * sumi;
-#endif
-}
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
-    const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        // SIMD dot product of quantized values
-        sumi = __dp4a(v[i], u[i], sumi);
-    }
-
-    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
-    const float d8d8 = tmp.x;
-    const float m8s8 = tmp.y;
-
-    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
-    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
-#endif
-}
-
-#define VDR_Q2_K_Q8_1_MMVQ 1
-#define VDR_Q2_K_Q8_1_MMQ  2
-
-// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
-    const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
-    const half2 & dm2, const float * __restrict__ d8) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR2_K; ++i) {
-        const int sc = scales[2*i];
-
-        const int vi = (v >> (2*i)) & 0x03030303;
-
-        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
-
-        // fill int with 4x m
-        int m = sc >> 4;
-        m |= m <<  8;
-        m |= m << 16;
-        sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
-    }
-
-    const float2 dm2f = __half22float2(dm2);
-
-    return dm2f.x*sumf_d - dm2f.y*sumf_m;
-#endif
-}
-
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
-    const half2 & dm2, const float & d8) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-    int sumi_d = 0;
-    int sumi_m = 0;
-
-#pragma unroll
-    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
-        int sumi_d_sc = 0;
-
-        const int sc = scales[i0 / (QI8_1/2)];
-
-        // fill int with 4x m
-        int m = sc >> 4;
-        m |= m <<  8;
-        m |= m << 16;
-
-#pragma unroll
-        for (int i = i0; i < i0 + QI8_1/2; ++i) {
-            sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
-            sumi_m    = __dp4a(m,    u[i], sumi_m); // multiply sum of q8_1 values with m
-        }
-
-        sumi_d += sumi_d_sc * (sc & 0xF);
-    }
-
-    const float2 dm2f = __half22float2(dm2);
-
-    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
-#endif
-}
-
-#define VDR_Q3_K_Q8_1_MMVQ 1
-#define VDR_Q3_K_Q8_1_MMQ  2
-
-// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
-    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
-    const int & scale_offset, const float & d3, const float * __restrict__ d8) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-
-    float sumf = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR3_K; ++i) {
-        const int isc = scale_offset + 2*i;
-
-        const int isc_low = isc % (QK_K/32);
-        const int sc_shift_low = 4 * (isc / (QK_K/32));
-        const int sc_low  = (scales[isc_low] >> sc_shift_low) & 0xF;
-
-        const int isc_high = isc % (QK_K/64);
-        const int sc_shift_high = 2 * (isc / (QK_K/64));
-        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
-
-        const int sc = (sc_low | sc_high) - 32;
-
-        const int vil = (vl >> (2*i)) & 0x03030303;
-
-        const int vih = ((vh >> i) << 2) & 0x04040404;
-
-        const int vi = __vsubss4(vil, vih);
-
-        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
-    }
-
-    return d3 * sumf;
-#endif
-}
-
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
-    const float & d3, const float & d8) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-    int sumi = 0;
-
-#pragma unroll
-    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
-        int sumi_sc = 0;
-
-        for (int i = i0; i < i0 + QI8_1/2; ++i) {
-            sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
-        }
-
-        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
-    }
-
-    return d3*d8 * sumi;
-#endif
-}
-
-#define VDR_Q4_K_Q8_1_MMVQ 2
-#define VDR_Q4_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR4_K; ++i) {
-        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
-        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
-
-        const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
-        const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
-
-        sumf_d += d8[i] * (dot1 * sc[i]);
-        sumf_m += d8[i] * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
-    }
-
-    const float2 dm4f = __half22float2(dm4);
-    return dm4f.x*sumf_d - dm4f.y*sumf_m;
-#endif
-}
-
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
-        int sumi_d = 0;
-
-#pragma unroll
-        for (int j = 0; j < QI8_1; ++j) {
-            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
-        }
-
-        const float2 ds8f = __half22float2(ds8[i]);
-
-        sumf_d += ds8f.x * (sc[i] * sumi_d);
-        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
-    }
-
-    const float2 dm4f = __half22float2(dm4);
-
-    return dm4f.x*sumf_d - dm4f.y*sumf_m;
-#endif
-}
-
-#define VDR_Q5_K_Q8_1_MMVQ 2
-#define VDR_Q5_K_Q8_1_MMQ  8
-
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
-    const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K; ++i) {
-        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
-        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
-
-        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
-        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
-
-        const int v0i = vl0i | vh0i;
-        const int v1i = vl1i | vh1i;
-
-        const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
-        const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
-
-        sumf_d += d8[i] * (dot1 * sc[i]);
-        sumf_m += d8[i] * (dot2 * m[i]);
-    }
-
-    const float2 dm5f = __half22float2(dm5);
-    return dm5f.x*sumf_d - dm5f.y*sumf_m;
-#endif
-}
-
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
-        int sumi_d = 0;
-
-#pragma unroll
-        for (int j = 0; j < QI8_1; ++j) {
-            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
-        }
-
-        const float2 ds8f = __half22float2(ds8[i]);
-
-        sumf_d += ds8f.x * (sc[i] * sumi_d);
-        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
-    }
-
-    const float2 dm4f = __half22float2(dm4);
-
-    return dm4f.x*sumf_d - dm4f.y*sumf_m;
-#endif
-}
-
-#define VDR_Q6_K_Q8_1_MMVQ 1
-#define VDR_Q6_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
-    const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
-    const float & d, const float * __restrict__ d8) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-    float sumf = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR6_K; ++i) {
-        const int sc = scales[4*i];
-        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
-        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
-        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
-
-        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
-    }
-
-    return d*sumf;
-#endif
-}
-
-static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
-    const float & d6, const float * __restrict__ d8) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-    float sumf_d = 0.0f;
-
-#pragma unroll
-    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
-        int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
-
-#pragma unroll
-        for (int i = i0; i < i0 + 2; ++i) {
-            sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
-            sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
-
-            sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
-            sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
-        }
-
-        sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
-    }
-
-    return d6 * sumf_d;
-#endif
-}
-
-static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
-
-    int v[VDR_Q4_0_Q8_1_MMVQ];
-    int u[2*VDR_Q4_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
-        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
-    }
-
-    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, __half2float(bq4_0->d), bq8_1->ds);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    __shared__ int  tile_x_qs[mmq_y * (WARP_SIZE_GGUF)       + mmq_y];
-    __shared__ float tile_x_d[mmq_y * (WARP_SIZE_GGUF/QI4_0) + mmq_y/QI4_0];
-    *x_ql = tile_x_qs;
-    *x_dm = (half2 *) tile_x_d;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    const int kbx  = k / QI4_0;
-    const int kqsx = k % QI4_0;
-
-    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-        if (need_check) {
-            i = min(i, i_max);
-        }
-        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
-        x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
-        // x_dmf[i * (WARP_SIZE_GGUF/QI4_0) + i / QI4_0 + kbx] = bxi->d;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI4_0;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
-        int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
-        if (need_check) {
-            i = min(i, i_max);
-        }
-        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-        x_dmf[i * (WARP_SIZE_GGUF/QI4_0) + i / QI4_0 + kbxd] = __half2float(bxi->d);
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const float * x_dmf = (const float *) x_dm;
-
-    int u[2*VDR_Q4_0_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l)         % WARP_SIZE_GGUF];
-        u[2*l+1] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l + QI4_0) % WARP_SIZE_GGUF];
-    }
-
-    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
-        (&x_ql[i * (WARP_SIZE_GGUF + 1) + k], u, x_dmf[i * (WARP_SIZE_GGUF/QI4_0) + i/QI4_0 + k/QI4_0],
-         y_ds[j * (WARP_SIZE_GGUF/QI8_1) + (2*k/QI8_1) % (WARP_SIZE_GGUF/QI8_1)]);
-}
-
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
-
-    int v[VDR_Q4_1_Q8_1_MMVQ];
-    int u[2*VDR_Q4_1_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
-        v[i]    = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
-    }
-
-    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    __shared__ int   tile_x_qs[mmq_y * (WARP_SIZE_GGUF) +     + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI4_1) + mmq_y/QI4_1];
-    *x_ql = tile_x_qs;
-    *x_dm = tile_x_dm;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    const int kbx  = k / QI4_1;
-    const int kqsx = k % QI4_1;
-
-    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-        if (need_check) {
-            i = min(i, i_max);
-        }
-        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
-        x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI4_1;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
-        int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
-        if (need_check) {
-            i = min(i, i_max);
-        }
-        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
-        x_dm[i * (WARP_SIZE_GGUF/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-
-    int u[2*VDR_Q4_1_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l)         % WARP_SIZE_GGUF];
-        u[2*l+1] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l + QI4_1) % WARP_SIZE_GGUF];
-    }
-
-    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
-        (&x_ql[i * (WARP_SIZE_GGUF + 1) + k], u, x_dm[i * (WARP_SIZE_GGUF/QI4_1) + i/QI4_1 + k/QI4_1],
-         y_ds[j * (WARP_SIZE_GGUF/QI8_1) + (2*k/QI8_1) % (WARP_SIZE_GGUF/QI8_1)]);
-}
-
-static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
-
-    int vl[VDR_Q5_0_Q8_1_MMVQ];
-    int vh[VDR_Q5_0_Q8_1_MMVQ];
-    int  u[2*VDR_Q5_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
-        vl[i]    = get_int_from_uint8(bq5_0->qs, iqs + i);
-        vh[i]    = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
-    }
-
-    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, __half2float(bq5_0->d), bq8_1->ds);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    __shared__ int  tile_x_ql[mmq_y * (2*WARP_SIZE_GGUF)     + mmq_y];
-    __shared__ float tile_x_d[mmq_y * (WARP_SIZE_GGUF/QI5_0) + mmq_y/QI5_0];
-
-    *x_ql = tile_x_ql;
-    *x_dm = (half2 *) tile_x_d;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    const int kbx  = k / QI5_0;
-    const int kqsx = k % QI5_0;
-
-    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
-        const int ql = get_int_from_uint8(bxi->qs, kqsx);
-        const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
-
-        int qs0 = (ql >>  0)   & 0x0F0F0F0F;
-        qs0    |= (qh <<  4)   & 0x00000010;  // 0 ->  4
-        qs0    |= (qh << 11)   & 0x00001000;  // 1 -> 12
-        qs0    |= (qh << 18)   & 0x00100000;  // 2 -> 20
-        qs0    |= (qh << 25)   & 0x10000000;  // 3 -> 28
-        qs0     = __vsubss4(qs0, 0x10101010); // subtract 16
-
-        x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2*k+0] = qs0;
-
-        int qs1 = (ql >>  4)   & 0x0F0F0F0F;
-        qs1    |= (qh >> 12)   & 0x00000010;  // 16 ->  4
-        qs1    |= (qh >>  5)   & 0x00001000;  // 17 -> 12
-        qs1    |= (qh <<  2)   & 0x00100000;  // 18 -> 20
-        qs1    |= (qh <<  9)   & 0x10000000;  // 19 -> 28
-        qs1     = __vsubss4(qs1, 0x10101010); // subtract 16
-
-        x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2*k+1] = qs1;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI5_0;
-    const int kbxd = k % blocks_per_tile_x_row;
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
-        int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-        x_dmf[i * (WARP_SIZE_GGUF/QI5_0) + i / QI5_0 + kbxd] = __half2float(bxi->d);
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const int index_bx = i * (WARP_SIZE_GGUF/QI5_0) + i/QI5_0 + k/QI5_0;
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    int u[2*VDR_Q5_0_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l)         % WARP_SIZE_GGUF];
-        u[2*l+1] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l + QI5_0) % WARP_SIZE_GGUF];
-    }
-
-    return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
-        (&x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE_GGUF/QI8_1) + (2*k/QI8_1) % (WARP_SIZE_GGUF/QI8_1)]);
-}
-
-static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
-
-    int vl[VDR_Q5_1_Q8_1_MMVQ];
-    int vh[VDR_Q5_1_Q8_1_MMVQ];
-    int  u[2*VDR_Q5_1_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
-        vl[i]   = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
-        vh[i]   = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
-    }
-
-    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE_GGUF)     + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI5_1) + mmq_y/QI5_1];
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    const int kbx  = k / QI5_1;
-    const int kqsx = k % QI5_1;
-
-    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
-        const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
-
-        int qs0 = (ql >>  0) & 0x0F0F0F0F;
-        qs0    |= (qh <<  4) & 0x00000010; // 0 ->  4
-        qs0    |= (qh << 11) & 0x00001000; // 1 -> 12
-        qs0    |= (qh << 18) & 0x00100000; // 2 -> 20
-        qs0    |= (qh << 25) & 0x10000000; // 3 -> 28
-
-        x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2*k+0] = qs0;
-
-        int qs1 = (ql >>  4) & 0x0F0F0F0F;
-        qs1    |= (qh >> 12) & 0x00000010; // 16 ->  4
-        qs1    |= (qh >>  5) & 0x00001000; // 17 -> 12
-        qs1    |= (qh <<  2) & 0x00100000; // 18 -> 20
-        qs1    |= (qh <<  9) & 0x10000000; // 19 -> 28
-
-        x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2*k+1] = qs1;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI5_1;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
-        int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dm[i * (WARP_SIZE_GGUF/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const int index_bx = i * (WARP_SIZE_GGUF/QI5_1) + + i/QI5_1 + k/QI5_1;
-
-    int u[2*VDR_Q5_1_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l)         % WARP_SIZE_GGUF];
-        u[2*l+1] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l + QI5_1) % WARP_SIZE_GGUF];
-    }
-
-    return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
-        (&x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE_GGUF/QI8_1) + (2*k/QI8_1) % (WARP_SIZE_GGUF/QI8_1)]);
-}
-
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
-
-    int v[VDR_Q8_0_Q8_1_MMVQ];
-    int u[VDR_Q8_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
-        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
-        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-    }
-
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, __half2float(bq8_0->d), __low2float(bq8_1->ds));
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    __shared__ int  tile_x_qs[mmq_y * (WARP_SIZE_GGUF)       + mmq_y];
-    __shared__ float tile_x_d[mmq_y * (WARP_SIZE_GGUF/QI8_0) + mmq_y/QI8_0];
-
-    *x_ql = tile_x_qs;
-    *x_dm = (half2 *) tile_x_d;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    const int kbx  = k / QI8_0;
-    const int kqsx = k % QI8_0;
-    float * x_dmf = (float *) x_dm;
-
-    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
-        x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI8_0;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
-        int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-        x_dmf[i * (WARP_SIZE_GGUF/QI8_0) + i / QI8_0 + kbxd] = __half2float(bxi->d);
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
-        (&x_ql[i * (WARP_SIZE_GGUF + 1) + k], &y_qs[j * WARP_SIZE_GGUF + k], x_dmf[i * (WARP_SIZE_GGUF/QI8_0) + i/QI8_0 + k/QI8_0],
-         y_df[j * (WARP_SIZE_GGUF/QI8_1) + k/QI8_1]);
-}
-
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
-
-    const int bq8_offset = QR2_K * (iqs / QI8_1);
-    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-
-    const uint8_t * scales = bq2_K->scales + scale_offset;
-
-    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
-    int    u[QR2_K];
-    float d8[QR2_K];
-
-#pragma unroll
-    for (int i = 0; i < QR2_K; ++ i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
-    }
-
-    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE_GGUF)       + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI2_K) + mmq_y/QI2_K];
-    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE_GGUF/4)     + mmq_y/4];
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    const int kbx  = k / QI2_K;
-    const int kqsx = k % QI2_K;
-
-    const block_q2_K * bx0 = (const block_q2_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
-        x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI2_K;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
-        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
-        x_dm[i * (WARP_SIZE_GGUF/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
-        int i = i0 + i_offset * 4 + k / (WARP_SIZE_GGUF/4);
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-        const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/4)) / (QI2_K/4);
-        x_sc[i * (WARP_SIZE_GGUF/4) + i / 4 + k % (WARP_SIZE_GGUF/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    const int kbx = k / QI2_K;
-    const int ky  = (k % QI2_K) * QR2_K;
-    const float * y_df = (const float *) y_ds;
-
-    int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
-
-    const int kqsx = i * (WARP_SIZE_GGUF + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
-    const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
-
-#pragma unroll
-    for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
-        v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
-    }
-
-    const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE_GGUF/4) + i/4 + kbx*4]) + ky/4;
-
-    const int index_y = j * WARP_SIZE_GGUF + (QR2_K*k) % WARP_SIZE_GGUF;
-    return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE_GGUF/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
-}
-
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
-
-    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
-    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-
-    const float d = __half2float(bq3_K->d);
-
-    const int vl = get_int_from_uint8(bq3_K->qs, iqs);
-
-    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
-    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
-
-    int    u[QR3_K];
-    float d8[QR3_K];
-
-#pragma unroll
-    for (int i = 0; i < QR3_K; ++i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
-    }
-
-    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE_GGUF)       + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI3_K) + mmq_y/QI3_K];
-    __shared__ int   tile_x_qh[mmq_y * (WARP_SIZE_GGUF/2)     + mmq_y/2];
-    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE_GGUF/4)     + mmq_y/4];
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_qh = tile_x_qh;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    const int kbx  = k / QI3_K;
-    const int kqsx = k % QI3_K;
-
-    const block_q3_K * bx0 = (const block_q3_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-        if (need_check) {
-            i = min(i, i_max);
-        }
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
-        x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI3_K;
-    const int kbxd = k % blocks_per_tile_x_row;
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
-        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
-        if (need_check) {
-            i = min(i, i_max);
-        }
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
-        x_dmf[i * (WARP_SIZE_GGUF/QI3_K) + i / QI3_K + kbxd] = __half2float(bxi->d);
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
-        int i = i0 + i_offset * 2 + k / (WARP_SIZE_GGUF/2);
-        if (need_check) {
-            i = min(i, i_max);
-        }
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/2)) / (QI3_K/2);
-        // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
-        x_qh[i * (WARP_SIZE_GGUF/2) + i / 2 + k % (WARP_SIZE_GGUF/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
-        int i = i0 + i_offset * 4 + k / (WARP_SIZE_GGUF/4);
-        if (need_check) {
-            i = min(i, i_max);
-        }
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/4)) / (QI3_K/4);
-
-        const int ksc = k % (QI3_K/4);
-
-        const int ksc_low = ksc % (QI3_K/8);
-        const int shift_low = 4 * (ksc / (QI3_K/8));
-        const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
-
-        const int ksc_high = QI3_K/8;
-        const int shift_high = 2 * ksc;
-        const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
-
-        const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
-
-        x_sc[i * (WARP_SIZE_GGUF/4) + i / 4 + k % (WARP_SIZE_GGUF/4)] = sc;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-
-    const int kbx  = k / QI3_K;
-    const int ky  = (k % QI3_K) * QR3_K;
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE_GGUF/4) + i/4 + kbx*4)) + ky/4;
-
-    int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
-        const int kqsx = i * (WARP_SIZE_GGUF + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
-        const int shift = 2 * ((ky % 32) / 8);
-        const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
-
-        const int vh = x_qh[i * (WARP_SIZE_GGUF/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
-        const int vlh = (vh << 2) & 0x04040404;
-
-        v[l] = __vsubss4(vll, vlh);
-    }
-
-    const int index_y = j * WARP_SIZE_GGUF + (k*QR3_K) % WARP_SIZE_GGUF;
-    return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE_GGUF/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
-}
-
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    int    v[2];
-    int    u[2*QR4_K];
-    float d8[QR4_K];
-
-    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
-    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
-
-    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
-    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
-    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
-    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
-
-    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
-    v[0] = q4[0];
-    v[1] = q4[4];
-
-    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
-    uint16_t aux[2];
-    const int j = bq8_offset/2;
-    if (j < 2) {
-        aux[0] = scales[j+0] & 0x3f3f;
-        aux[1] = scales[j+2] & 0x3f3f;
-    } else {
-        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
-        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
-    }
-    const uint8_t * sc = (const uint8_t *)aux;
-    const uint8_t * m  = sc + 2;
-
-    for (int i = 0; i < QR4_K; ++i) {
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = __low2float(bq8i->ds);
-
-        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
-        u[2*i+0] = q8[0];
-        u[2*i+1] = q8[4];
-    }
-
-    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE_GGUF)       + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI4_K) + mmq_y/QI4_K];
-    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE_GGUF/8)     + mmq_y/8];
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    const int kbx  = k / QI4_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI4_K; // == k if QK_K == 256
-
-    const block_q4_K * bx0 = (const block_q4_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
-        x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI4_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
-        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
-        if (need_check) {
-            i = min(i, i_max);
-        }
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
-        x_dm[i * (WARP_SIZE_GGUF/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE_GGUF/8)) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/8)) / (QI4_K/8);
-
-        const int * scales = (const int *) bxi->scales;
-
-        const int ksc = k % (WARP_SIZE_GGUF/8);
-        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
-        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
-        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
-
-        x_sc[i * (WARP_SIZE_GGUF/8) + i / 8 + ksc] = scales8;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh;
-
-    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE_GGUF/8) + i/8 + k/16]) + 2*((k % 16) / 8);
-
-    const int index_y = j * WARP_SIZE_GGUF + (QR4_K*k) % WARP_SIZE_GGUF;
-    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE_GGUF + 1) + k], &y_qs[index_y], sc, sc+8,
-                                      x_dm[i * (WARP_SIZE_GGUF/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
-}
-
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    int   vl[2];
-    int   vh[2];
-    int    u[2*QR5_K];
-    float d8[QR5_K];
-
-    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
-    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
-    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
-
-    vl[0] = ql[0];
-    vl[1] = ql[4];
-
-    vh[0] = qh[0] >> bq8_offset;
-    vh[1] = qh[4] >> bq8_offset;
-
-    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
-    uint16_t aux[2];
-    const int j = bq8_offset/2;
-    if (j < 2) {
-        aux[0] = scales[j+0] & 0x3f3f;
-        aux[1] = scales[j+2] & 0x3f3f;
-    } else {
-        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
-        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
-    }
-    const uint8_t * sc = (const uint8_t *)aux;
-    const uint8_t * m  = sc + 2;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K; ++i) {
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = __low2float(bq8i->ds);
-
-        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
-        u[2*i+0] = q8[0];
-        u[2*i+1] = q8[4];
-    }
-
-    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE_GGUF)     + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI5_K) + mmq_y/QI5_K];
-    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE_GGUF/8)     + mmq_y/8];
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    const int kbx  = k / QI5_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI5_K; // == k if QK_K == 256
-
-    const block_q5_K * bx0 = (const block_q5_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
-        const int ky = QR5_K*kqsx;
-
-        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
-        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
-        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
-
-        const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
-        const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
-        const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
-
-        const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
-        const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
-
-        x_ql[i * (2*WARP_SIZE_GGUF + 1) + kq0] = ql0 | qh0;
-        x_ql[i * (2*WARP_SIZE_GGUF + 1) + kq1] = ql1 | qh1;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI5_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
-        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
-        x_dm[i * (WARP_SIZE_GGUF/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE_GGUF/8)) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/8)) / (QI5_K/8);
-
-        const int * scales = (const int *) bxi->scales;
-
-        const int ksc = k % (WARP_SIZE_GGUF/8);
-
-        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
-        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
-        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
-
-        x_sc[i * (WARP_SIZE_GGUF/8) + i / 8 + ksc] = scales8;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE_GGUF/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
-
-    const int index_x = i * (QR5_K*WARP_SIZE_GGUF + 1) +  QR5_K*k;
-    const int index_y = j * WARP_SIZE_GGUF             + (QR5_K*k) % WARP_SIZE_GGUF;
-    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
-                                      x_dm[i * (WARP_SIZE_GGUF/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
-}
-
-static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
-
-    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
-    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
-    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
-
-    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
-    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
-
-    const int8_t * scales = bq6_K->scales + scale_offset;
-
-    int    u[QR6_K];
-    float d8[QR6_K];
-
-#pragma unroll
-    for (int i = 0; i < QR6_K; ++i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
-    }
-
-    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, __half2float(bq6_K->d), d8);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE_GGUF)     + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI6_K) + mmq_y/QI6_K];
-    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE_GGUF/8)     + mmq_y/8];
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    const int kbx  = k / QI6_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI6_K; // == k if QK_K == 256
-
-    const block_q6_K * bx0 = (const block_q6_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
-        const int ky = QR6_K*kqsx;
-
-        const int ql = get_int_from_uint8(bxi->ql, kqsx);
-        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
-        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
-
-        const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
-        const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
-        const int qh1 =  (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4))))       & 0x30303030;
-
-        const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
-        const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
-
-        x_ql[i * (2*WARP_SIZE_GGUF + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
-        x_ql[i * (2*WARP_SIZE_GGUF + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI6_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
-        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE_GGUF/QI6_K) + i / QI6_K + kbxd] = __half2float(bxi->d);
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE_GGUF/8)) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/8)) / 4;
-
-        x_sc[i * (WARP_SIZE_GGUF/8) + i / 8 + k % (WARP_SIZE_GGUF/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE_GGUF/8) + i/8 + k/8]);
-
-    const int index_x = i * (QR6_K*WARP_SIZE_GGUF + 1) +  QR6_K*k;
-    const int index_y = j * WARP_SIZE_GGUF             + (QR6_K*k) % WARP_SIZE_GGUF;
-    return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE_GGUF/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
-}
-
-static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-    const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
-
-    const int ib32 = iqs;
-    const uint16_t * q2 = bq2->qs + 4*ib32;
-    const uint8_t  * aux8 = (const uint8_t *)q2;
-    const int8_t   * q8 = bq8_1[ib32].qs;
-    uint32_t aux32 = q2[2] | (q2[3] << 16);
-    int sumi = 0;
-    for (int l = 0; l < 4; ++l) {
-        const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
-        const uint8_t  signs = ksigns_iq2xs[aux32 & 127];
-        for (int j = 0; j < 8; ++j) {
-            sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-        }
-        q8 += 8;
-        aux32 >>= 7;
-    }
-    const float d = __half2float(bq2->d) * (0.5f + aux32) * __half2float(bq8_1[ib32].ds.x) * 0.25f;
-    return d * sumi;
-}
-
-static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
-
-    const int ib32 = iqs;
-    const uint16_t * q2 = bq2->qs + 4*ib32;
-    const int8_t   * q8 = bq8_1[ib32].qs;
-    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
-    const uint8_t ls2 = bq2->scales[ib32] >>  4;
-    int sumi1 = 0;
-    for (int l = 0; l < 2; ++l) {
-        const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-        const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
-        for (int j = 0; j < 8; ++j) {
-            sumi1 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-        }
-        q8 += 8;
-    }
-    int sumi2 = 0;
-    for (int l = 2; l < 4; ++l) {
-        const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-        const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
-        for (int j = 0; j < 8; ++j) {
-            sumi2 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-        }
-        q8 += 8;
-    }
-    const float d = __half2float(bq2->d) * __half2float(bq8_1[ib32].ds.x) * 0.25f;
-    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
-}
-
-static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-    const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
-
-    const int ib32 = iqs;
-    const int8_t  * q8 = bq8_1[ib32].qs;
-    const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
-    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
-    const uint8_t ls2 = bq2->scales[ib32] >>  4;
-    int sumi1 = 0;
-    for (int l = 0; l < 2; ++l) {
-        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
-        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
-        const uint32_t signs1 = __vcmpeq4(((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
-        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
-        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
-        sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
-        sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
-        q8 += 8;
-    }
-    int sumi2 = 0;
-    for (int l = 2; l < 4; ++l) {
-        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
-        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
-        const uint32_t signs1 = __vcmpeq4(((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
-        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
-        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
-        sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
-        sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
-        q8 += 8;
-    }
-    const float d = __half2float(bq2->d) * __low2float(bq8_1[ib32].ds) * 0.25f;
-    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
-#endif
-}
-
-static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-    const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
-
-    const int ib32 = iqs;
-    const uint8_t  * q3 = bq2->qs + 8*ib32;
-    const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
-    const int8_t   * q8 = bq8_1[ib32].qs;
-    uint32_t aux32 = gas[0] | (gas[1] << 16);
-    int sumi = 0;
-    for (int l = 0; l < 4; ++l) {
-        const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
-        const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
-        const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
-        const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]);
-        const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]);
-        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
-        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
-        q8 += 8;
-        aux32 >>= 7;
-    }
-    const float d = __half2float(bq2->d) * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f;
-    return d * sumi;
-#endif
-}
-
-static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-    const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
-
-    const int ib32 = iqs;
-    const uint8_t  * qs = bq2->qs + 8*ib32;
-    const int8_t   * q8 = bq8_1[ib32].qs;
-    int sumi = 0;
-    for (int l = 0; l < 4; ++l) {
-        const uint32_t * grid1 = iq3xs_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
-        const uint32_t * grid2 = iq3xs_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
-        uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
-        uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
-        const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
-        const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
-        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
-        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
-        q8 += 8;
-    }
-    const float d = __half2float(bq2->d) * (0.5f + ((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds) * 0.5f;
-    return d * sumi;
-#endif
-}
-
-static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
-
-    const int       qs_packed = get_int_b2(bq1->qs, iqs);
-    const uint8_t * qs        = (const uint8_t *) &qs_packed;
-
-    const int qh = bq1->qh[iqs];
-
-    int sumi = 0;
-#pragma unroll
-    for (int l0 = 0; l0 < 8; l0 += 2) {
-        const int grid = iq1s_grid_gpu[qs[l0/2] | (((qh >> 3*(l0/2)) & 0x07) << 8)];
-
-        const int grid0 = (grid >> 0) & 0x0F0F0F0F;
-        const int grid1 = (grid >> 4) & 0x0F0F0F0F;
-
-        const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0);
-        const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1);
-
-        sumi = __dp4a(grid0, u0, sumi);
-        sumi = __dp4a(grid1, u1, sumi);
-    }
-
-    const float  d1q   = __half2float(bq1->d) * (((qh >> 11) & 0x0E) + 1);
-    const float  delta = -1.0f + IQ1S_DELTA - (qh & 0x8000) * (2.0f*IQ1S_DELTA/0x8000);
-    const float2 ds    = __half22float2(bq8_1[iqs].ds);
-    return d1q * (ds.x*sumi + ds.y*delta);
-#endif
-}
-
-static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-
-    const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
-
-    const int       qs_packed = get_int_b4(bq1->qs, iqs);
-    const uint8_t * qs        = (const uint8_t *) &qs_packed;
-
-    int   sumi[2] = {0};
-    float sumf[2] = {0.0f};
-#pragma unroll
-    for (int l0 = 0; l0 < 8; l0 += 2) {
-        const int qhl = bq1->qh[2*iqs + l0/4] >> (4 * ((l0/2) % 2));
-
-        const int grid = iq1s_grid_gpu[qs[l0/2] | ((qhl & 0x07) << 8)];
-
-        const int grid0 = (grid >> 0) & 0x0F0F0F0F;
-        const int grid1 = (grid >> 4) & 0x0F0F0F0F;
-
-        const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0);
-        const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1);
-
-        sumi[l0/4] = __dp4a(grid0, u0, sumi[l0/4]);
-        sumi[l0/4] = __dp4a(grid1, u1, sumi[l0/4]);
-
-        const float delta = -1.0f + IQ1M_DELTA - (qhl & 0x08) * (2.0f*IQ1M_DELTA/0x08);
-        int sumy = 0;
-        sumy = __dp4a(u0, 0x01010101, sumy);
-        sumy = __dp4a(u1, 0x01010101, sumy);
-        sumf[l0/4] += delta*sumy;
-    }
-
-    const uint16_t * sc = (const uint16_t *) bq1->scales;
-
-    iq1m_scale_t scale;
-    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00F0) | ((sc[2] >> 4) & 0x0F00) | (sc[3] & 0xF000);
-    const float d = __half2float(scale.f16) * __low2float(bq8_1[iqs].ds);
-
-    const int tmp = sc[iqs/2] >> (6*(iqs%2));
-    const int sc0 = 2*((tmp >> 0) & 0x07) + 1;
-    const int sc1 = 2*((tmp >> 3) & 0x07) + 1;
-    return d * ((sumi[0] + sumf[0]) * sc0 + (sumi[1] + sumf[1]) * sc1);
-#endif
-}
-
-static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
-        int & val1, int & val2) {
-
-    uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
-    aux32 = q4 & 0x0f0f0f0f;
-    uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
-    uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
-    val1 = v1 | (v2 << 16);
-    aux32 = (q4 >> 4) & 0x0f0f0f0f;
-    v1 = values[q8[0]] | (values[q8[1]] << 8);
-    v2 = values[q8[2]] | (values[q8[3]] << 8);
-    val2 = v1 | (v2 << 16);
-}
-
-static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-
-    const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
-
-    const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
-    const int32_t  * q8 = (const int32_t  *)bq8_1->qs + iqs;
-
-    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
-
-    int v1, v2;
-    int sumi1 = 0, sumi2 = 0;
-    for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
-        const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
-        get_int_from_table_16(aux, values, v1, v2);
-        sumi1 = __dp4a(v1, q8[l+0], sumi1);
-        sumi2 = __dp4a(v2, q8[l+4], sumi2);
-    }
-    const float d = __half2float(bq->d) * __low2float(bq8_1->ds);
-    return d * (sumi1 + sumi2);
-#endif
-}
-
-
-static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM
-    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
-    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
-
-    // iqs is 0...7
-    const int ib32 = iqs;
-    const int32_t  * q8 = (const int *)bq8_1[ib32].qs;
-    const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
-    const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
-    const float d = __half2float(bq4->d) * (ls - 32) * __low2float(bq8_1[ib32].ds);
-    int v1, v2;
-    int sumi1 = 0, sumi2 = 0;
-    for (int j = 0; j < 4; ++j) {
-        get_int_from_table_16(q4[j], values, v1, v2);
-        sumi1 = __dp4a(v1, q8[j+0], sumi1);
-        sumi2 = __dp4a(v2, q8[j+4], sumi2);
-    }
-    return d * (sumi1 + sumi2);
-#endif
-}
\ No newline at end of file
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 27966e6808e9..b601760d4119 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -339,39 +339,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 
 #endif
 
-  // Dequantization for GGML.
-  ops.def(
-      "ggml_dequantize(Tensor W, int type, SymInt m, SymInt n, ScalarType? "
-      "dtype) -> Tensor");
-  ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize);
-
-  // mmvq kernel for GGML.
-  ops.def(
-      "ggml_mul_mat_vec_a8(Tensor W, Tensor X, int type, SymInt row) "
-      "-> Tensor");
-  ops.impl("ggml_mul_mat_vec_a8", torch::kCUDA, &ggml_mul_mat_vec_a8);
-
-  // mmq kernel for GGML.
-  ops.def(
-      "ggml_mul_mat_a8(Tensor W, Tensor X, int type, SymInt row) -> Tensor");
-  ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8);
-
-  // moe kernel for GGML.
-  ops.def(
-      "ggml_moe_a8(Tensor X, Tensor W, "
-      "Tensor sorted_token_ids, Tensor expert_ids, Tensor "
-      "num_tokens_post_padded, "
-      "int type, SymInt row, SymInt top_k, SymInt tokens) -> Tensor");
-  ops.impl("ggml_moe_a8", torch::kCUDA, &ggml_moe_a8);
-
-  ops.def(
-      "ggml_moe_a8_vec(Tensor X, Tensor W, "
-      "Tensor topk_ids, int top_k, "
-      "int type, SymInt row, SymInt tokens) -> Tensor");
-  ops.impl("ggml_moe_a8_vec", torch::kCUDA, &ggml_moe_a8_vec);
-
-  ops.def("ggml_moe_get_block_size", &ggml_moe_get_block_size);
-
 #ifndef USE_ROCM
   // Expert-specialization mxfp8 blockscaled grouped quantization (SM100+).
   ops.def(
diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md
index 0b8fc71d3f30..549eeda98675 100644
--- a/docs/features/quantization/README.md
+++ b/docs/features/quantization/README.md
@@ -9,7 +9,6 @@ The following are the supported quantization formats for vLLM:
 
 - [AutoAWQ](auto_awq.md)
 - [BitsAndBytes](bnb.md)
-- [GGUF](gguf.md)
 - [GPTQModel](gptqmodel.md)
 - [Intel Neural Compressor](inc.md)
 - [INT4 W4A16](int4.md)
@@ -53,7 +52,6 @@ th:not(:first-child) {
 | FP8 (W8A8)                | ❌    | ❌     | ❌     | ✅︎  | ✅︎     | ✅︎      | ❌        | ❌      |
 | bitsandbytes              | ✅︎    | ✅︎     | ✅︎     | ✅︎  | ✅︎     | ❌      | ❌        | ❌      |
 | DeepSpeedFP               | ✅︎    | ✅︎     | ✅︎     | ✅︎  | ✅︎     | ❌      | ❌        | ❌      |
-| GGUF                      | ✅︎    | ✅︎     | ✅︎     | ✅︎  | ✅︎     | ✅︎      | ❌        | ❌      |
 
 - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
 - ✅︎ indicates that the quantization method is supported on the specified hardware.
diff --git a/docs/features/quantization/gguf.md b/docs/features/quantization/gguf.md
deleted file mode 100644
index 41912a506014..000000000000
--- a/docs/features/quantization/gguf.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# GGUF
-
-!!! warning
-    Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team.
-
-!!! warning
-    Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge them to a single-file model.
-
-To run a GGUF model with vLLM, you can use the `repo_id:quant_type` format to load directly from HuggingFace. For example, to load a Q4_K_M quantized model from [unsloth/Qwen3-0.6B-GGUF](https://huggingface.co/unsloth/Qwen3-0.6B-GGUF):
-
-```bash
-# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
-vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M --tokenizer Qwen/Qwen3-0.6B
-```
-
-You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs:
-
-```bash
-vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M \
-   --tokenizer Qwen/Qwen3-0.6B \
-   --tensor-parallel-size 2
-```
-
-Alternatively, you can download and use a local GGUF file:
-
-```bash
-wget https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf
-vllm serve ./Qwen3-0.6B-Q4_K_M.gguf --tokenizer Qwen/Qwen3-0.6B
-```
-
-!!! warning
-    We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size.
-
-GGUF assumes that HuggingFace can convert the metadata to a config file. In case HuggingFace doesn't support your model you can manually create a config and pass it as hf-config-path
-
-```bash
-# If your model is not supported by HuggingFace you can manually provide a HuggingFace compatible config path
-vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M \
-   --tokenizer Qwen/Qwen3-0.6B \
-   --hf-config-path Qwen/Qwen3-0.6B
-```
-
-You can also use the GGUF model directly through the LLM entrypoint:
-
-??? code
-
-      ```python
-      from vllm import LLM, SamplingParams
-
-      # In this script, we demonstrate how to pass input to the chat method:
-      conversation = [
-         {
-            "role": "system",
-            "content": "You are a helpful assistant",
-         },
-         {
-            "role": "user",
-            "content": "Hello",
-         },
-         {
-            "role": "assistant",
-            "content": "Hello! How can I assist you today?",
-         },
-         {
-            "role": "user",
-            "content": "Write an essay about the importance of higher education.",
-         },
-      ]
-
-      # Create a sampling params object.
-      sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-      # Create an LLM using repo_id:quant_type format.
-      llm = LLM(
-         model="unsloth/Qwen3-0.6B-GGUF:Q4_K_M",
-         tokenizer="Qwen/Qwen3-0.6B",
-      )
-      # Generate texts from the prompts. The output is a list of RequestOutput objects
-      # that contain the prompt, generated text, and other information.
-      outputs = llm.chat(conversation, sampling_params)
-
-      # Print the outputs.
-      for output in outputs:
-         prompt = output.prompt
-         generated_text = output.outputs[0].text
-         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-      ```
diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py
index 194db05e395e..07fbd7e4d555 100644
--- a/docs/mkdocs/hooks/generate_examples.py
+++ b/docs/mkdocs/hooks/generate_examples.py
@@ -32,7 +32,6 @@ def title(text: str) -> str:
         "mae": "MAE",
         "ner": "NER",
         "tpu": "TPU",
-        "gguf": "GGUF",
         "lora": "LoRA",
         "nccl": "NCCL",
         "rlhf": "RLHF",
diff --git a/requirements/common.txt b/requirements/common.txt
index b610fd678687..e4b7b339f95d 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -30,7 +30,6 @@ filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/31
 partial-json-parser # used for parsing partial JSON outputs
 pyzmq >= 25.0.0
 msgspec
-gguf >= 0.17.0
 mistral_common[image] >= 1.11.0
 opencv-python-headless >= 4.13.0    # required for video IO
 pyyaml
diff --git a/requirements/test/rocm.txt b/requirements/test/rocm.txt
index e1efae912ee4..d58ccdef3537 100644
--- a/requirements/test/rocm.txt
+++ b/requirements/test/rocm.txt
@@ -322,10 +322,6 @@ genson==1.3.0
     # via datamodel-code-generator
 geopandas==1.1.3
     # via terratorch
-gguf==0.18.0
-    # via
-    #   -c requirements/common.txt
-    #   -r requirements/test/../common.txt
 gitdb==4.0.12
     # via gitpython
 gitpython==3.1.46
@@ -686,7 +682,6 @@ numpy==2.2.6
     #   fastparquet
     #   genai-perf
     #   geopandas
-    #   gguf
     #   h5py
     #   imagehash
     #   imageio
@@ -1119,7 +1114,6 @@ pyyaml==6.0.3
     #   datamodel-code-generator
     #   datasets
     #   genai-perf
-    #   gguf
     #   huggingface-hub
     #   jsonargparse
     #   lightning
@@ -1176,7 +1170,6 @@ requests==2.32.5
     #   diffusers
     #   docker
     #   evaluate
-    #   gguf
     #   google-api-core
     #   google-cloud-storage
     #   gpt-oss
@@ -1453,7 +1446,6 @@ tqdm==4.67.3
     #   -r requirements/test/../common.txt
     #   datasets
     #   evaluate
-    #   gguf
     #   huggingface-hub
     #   lightly
     #   lightning
diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py
index ed4c92d90ff7..cc138454802b 100644
--- a/tests/compile/fullgraph/test_full_graph.py
+++ b/tests/compile/fullgraph/test_full_graph.py
@@ -39,12 +39,6 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None):
             ]
         )
 
-        # TODO: figure out why this fails.
-        if False and is_quant_method_supported("gguf"):  # noqa: SIM223
-            TEST_MODELS.append(
-                ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {"quantization": "gguf"})
-            )
-
         if is_quant_method_supported("gptq"):
             TEST_MODELS.append(
                 ("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {"quantization": "gptq"})
diff --git a/tests/kernels/quantization/test_ggml.py b/tests/kernels/quantization/test_ggml.py
deleted file mode 100644
index 0dc24187f2b3..000000000000
--- a/tests/kernels/quantization/test_ggml.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import gguf
-import pytest
-import torch
-
-from tests.kernels.utils import opcheck
-from vllm import _custom_ops as ops  # noqa: F401
-
-
-@pytest.mark.parametrize("quant_type", [12])
-def test_ggml_opcheck(quant_type):
-    block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type]
-    shape = [256, 1152]
-    qweight = torch.randint(0, 100, shape, device="cuda", dtype=torch.uint8)
-    m = qweight.shape[0]
-    n = qweight.shape[1] // type_size * block_size
-    opcheck(torch.ops._C.ggml_dequantize, (qweight, quant_type, m, n, torch.float16))
-
-    x = torch.rand((m, 512), device="cuda", dtype=torch.float16)
-    opcheck(torch.ops._C.ggml_mul_mat_a8, (qweight, x, quant_type, qweight.shape[0]))
-    opcheck(
-        torch.ops._C.ggml_mul_mat_vec_a8, (qweight, x, quant_type, qweight.shape[0])
-    )
-
-    shape = [256, 1024, 336]
-    qweight = torch.randint(0, 100, shape, device="cuda", dtype=torch.uint8)
-    x = torch.rand((1, 1024), device="cuda", dtype=torch.float16)
-    sorted_token_ids = torch.arange(776, device="cuda")
-    expert_ids = torch.randint(0, 256, (194,), device="cuda")
-    num_tokens_post_padded = torch.tensor([1], dtype=torch.int64, device="cuda")
-
-    opcheck(
-        torch.ops._C.ggml_moe_a8,
-        (
-            x,
-            qweight,
-            sorted_token_ids,
-            expert_ids,
-            num_tokens_post_padded,
-            quant_type,
-            qweight.shape[0],
-            1,
-            x.shape[0],
-        ),
-    )
-
-    topk_ids = torch.zeros((1, 1), device="cuda", dtype=torch.int32)
-
-    opcheck(
-        torch.ops._C.ggml_moe_a8_vec,
-        (x, qweight, topk_ids, 1, quant_type, qweight.shape[0], x.shape[0]),
-    )
diff --git a/tests/kernels/quantization/test_gguf.py b/tests/kernels/quantization/test_gguf.py
deleted file mode 100644
index 912d5fee4e59..000000000000
--- a/tests/kernels/quantization/test_gguf.py
+++ /dev/null
@@ -1,207 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from pathlib import Path
-
-import pytest
-import torch
-from gguf import GGMLQuantizationType, GGUFReader, ReaderTensor, dequantize
-from huggingface_hub import snapshot_download
-
-import vllm._custom_ops as ops
-from vllm.model_executor.layers.fused_moe import fused_experts
-from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf
-from vllm.utils.torch_utils import set_random_seed
-
-GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")
-GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample")
-
-
-def get_gguf_sample_tensors(
-    hidden_size: int, quant_type: GGMLQuantizationType
-) -> list[ReaderTensor]:
-    sample_dir = GGUF_SAMPLE
-    filename = f"Quant_{quant_type.name}_{hidden_size}.gguf"
-    sample_file = Path(sample_dir) / filename
-    return GGUFReader(sample_file).tensors
-
-
-def get_gguf_MoE_tensors(
-    hidden_size: int, quant_type: GGMLQuantizationType
-) -> list[ReaderTensor]:
-    sample_dir = GGUF_SAMPLE_MOE
-    filename = f"Quant_{quant_type.name}_{hidden_size}.gguf"
-    sample_file = Path(sample_dir) / filename
-    return GGUFReader(sample_file).tensors
-
-
-DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]
-# Hidden_size for testing, must match the sample file in HF repo,
-# we have `hidden_size = 256, 1024` for test in HF repo currently.
-HIDDEN_SIZES = [256, 1024]
-NUM_TOKENS = [7, 2050]  # Arbitrary values for testing
-SEEDS = [0]
-QUANT_TYPES = [
-    # i-matrix
-    GGMLQuantizationType.IQ1_M,
-    GGMLQuantizationType.IQ1_S,
-    GGMLQuantizationType.IQ2_S,
-    GGMLQuantizationType.IQ2_XS,
-    GGMLQuantizationType.IQ3_S,
-    GGMLQuantizationType.IQ3_XXS,
-    GGMLQuantizationType.IQ4_NL,
-    GGMLQuantizationType.IQ4_XS,
-    # k-quants
-    GGMLQuantizationType.Q2_K,
-    GGMLQuantizationType.Q3_K,
-    GGMLQuantizationType.Q4_K,
-    GGMLQuantizationType.Q5_K,
-    GGMLQuantizationType.Q6_K,
-    # standard quantization
-    GGMLQuantizationType.Q4_0,
-    GGMLQuantizationType.Q5_0,
-    GGMLQuantizationType.Q8_0,
-]
-
-
-@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("quant_type", QUANT_TYPES)
-@torch.inference_mode()
-def test_dequantize(
-    hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType
-):
-    tensors = get_gguf_sample_tensors(hidden_size, quant_type)
-    for tensor in tensors:
-        shape_str = tensor.name.split("_")[-1]
-        shape = map(int, shape_str.split("x"))
-
-        ref_output = torch.tensor(
-            dequantize(tensor.data, quant_type), device="cuda"
-        ).to(dtype)
-        output = ops.ggml_dequantize(
-            torch.tensor(tensor.data, device="cuda"), quant_type, *list(shape), dtype
-        )
-
-        torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=4e-2)
-
-
-@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("quant_type", QUANT_TYPES)
-@torch.inference_mode()
-def test_mmvq(hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType):
-    set_random_seed(0)
-
-    tensors = get_gguf_sample_tensors(hidden_size, quant_type)
-    x = torch.rand((1, hidden_size), dtype=dtype, device="cuda")
-    for tensor in tensors:
-        weight = torch.tensor(dequantize(tensor.data, quant_type), device="cuda").to(
-            dtype
-        )
-        ref_output = x @ weight.T
-
-        qweight = torch.tensor(tensor.data, device="cuda")
-        output = ops.ggml_mul_mat_vec_a8(qweight, x, quant_type, qweight.shape[0]).to(
-            dtype
-        )
-
-        torch.testing.assert_close(output, ref_output, atol=1, rtol=1e-1)
-
-
-@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
-@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize(
-    "quant_type",
-    [
-        # k-quants
-        GGMLQuantizationType.Q2_K,
-        GGMLQuantizationType.Q3_K,
-        GGMLQuantizationType.Q4_K,
-        GGMLQuantizationType.Q5_K,
-        GGMLQuantizationType.Q6_K,
-        # standard quants
-        GGMLQuantizationType.Q4_0,
-        GGMLQuantizationType.Q5_0,
-        GGMLQuantizationType.Q8_0,
-    ],
-)
-@torch.inference_mode()
-def test_mmq(
-    num_tokens: int,
-    hidden_size: int,
-    dtype: torch.dtype,
-    quant_type: GGMLQuantizationType,
-):
-    set_random_seed(0)
-
-    tensors = get_gguf_sample_tensors(hidden_size, quant_type)
-    x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda")
-    for tensor in tensors:
-        weight = torch.tensor(dequantize(tensor.data, quant_type), device="cuda").to(
-            dtype
-        )
-        ref_output = x @ weight.T
-
-        qweight = torch.tensor(tensor.data, device="cuda")
-        output = ops.ggml_mul_mat_a8(qweight, x, quant_type, qweight.shape[0])
-        atols = {torch.half: 1, torch.bfloat16: 1.5, torch.float: 1.2}
-        # test matrix has inputs centered around 0 and lower precision from
-        # bfloat16 tends to accumulate and can greatly inflate rtol
-        # since outputs are also very close to 0
-        rtols = {torch.half: 1e-1, torch.bfloat16: 1e4, torch.float: 2e1}
-        torch.testing.assert_close(
-            output, ref_output, atol=atols[dtype], rtol=rtols[dtype]
-        )
-
-
-@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
-@pytest.mark.parametrize("hidden_size", [512])
-@pytest.mark.parametrize("top_k", [4, 8])
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("quant_type", QUANT_TYPES)
-@torch.inference_mode()
-def test_moe(
-    num_tokens: int,
-    hidden_size: int,
-    dtype: torch.dtype,
-    quant_type: GGMLQuantizationType,
-    top_k: int,
-):
-    set_random_seed(0)
-    H, E = 1024, 256
-
-    x = torch.rand((num_tokens, H), dtype=dtype, device="cuda")
-
-    topk_weights = torch.rand(num_tokens, top_k, device="cuda", dtype=dtype)
-    topk_ids = torch.randint(
-        0, E, (num_tokens, top_k), device="cuda", dtype=torch.int32
-    )
-
-    tensors = get_gguf_MoE_tensors(hidden_size, quant_type)
-
-    w13 = tensors[0]
-    w2 = tensors[1]
-
-    w13_dequant = torch.tensor(dequantize(w13.data, quant_type), device="cuda").to(
-        dtype
-    )
-
-    w2_dequant = torch.tensor(dequantize(w2.data, quant_type), device="cuda").to(dtype)
-
-    output = _fused_moe_gguf(
-        x,
-        torch.tensor(w13.data, device="cuda"),
-        torch.tensor(w2.data, device="cuda"),
-        topk_weights,
-        topk_ids,
-        quant_type,
-        quant_type,
-        "silu",
-    )
-
-    ref_output = fused_experts(
-        x, w13_dequant, w2_dequant, topk_weights, topk_ids
-    ).reshape(output.shape)
-    torch.testing.assert_close(output, ref_output, atol=1, rtol=1e-1)
diff --git a/tests/models/multimodal/generation/test_multimodal_gguf.py b/tests/models/multimodal/generation/test_multimodal_gguf.py
deleted file mode 100644
index 813dccf1451b..000000000000
--- a/tests/models/multimodal/generation/test_multimodal_gguf.py
+++ /dev/null
@@ -1,180 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import os
-
-os.environ["TOKENIZERS_PARALLELISM"] = "true"
-
-from typing import Any, NamedTuple
-
-import pytest
-from huggingface_hub import hf_hub_download
-from pytest import MarkDecorator
-from transformers import AutoModelForImageTextToText
-
-from tests.quantization.utils import is_quant_method_supported
-from vllm.assets.image import ImageAsset
-from vllm.multimodal.image import rescale_image_size
-from vllm.utils.torch_utils import set_default_torch_num_threads
-
-from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner
-from ...utils import check_logprobs_close
-
-
-class GGUFMMTestConfig(NamedTuple):
-    original_model: str
-    gguf_repo: str
-    gguf_backbone: str
-    gguf_mmproj: str
-    prompt: list[str]
-    image_names: list[str]  # Store names, load PIL images at runtime
-    max_model_len: int = 4096
-    marks: list[MarkDecorator] = []
-    mm_processor_kwargs: dict[str, Any] = {}
-
-    @property
-    def gguf_model(self):
-        hf_hub_download(self.gguf_repo, filename=self.gguf_mmproj)
-        return hf_hub_download(self.gguf_repo, filename=self.gguf_backbone)
-
-
-# Common prompts aligned with test_common.py "gemma3" entry format
-_GEMMA3_PROMPTS = IMAGE_ASSETS.prompts(
-    {
-        "stop_sign": (
-            "<bos><start_of_turn>user\n"
-            "<start_of_image>What's the content in the center of the image?"
-            "<end_of_turn>\n<start_of_turn>model\n"
-        ),
-        "cherry_blossom": (
-            "<bos><start_of_turn>user\n"
-            "<start_of_image>What is the season?"
-            "<end_of_turn>\n<start_of_turn>model\n"
-        ),
-    }
-)
-
-# Image asset names - load at runtime to avoid pickle issues with subprocess
-_GEMMA3_IMAGE_NAMES = ["stop_sign", "cherry_blossom"]
-
-# Regular multimodal (no pan-and-scan) - uses QAT Q4_0 GGUF
-GEMMA3_CONFIG = GGUFMMTestConfig(
-    original_model="google/gemma-3-4b-it",
-    gguf_repo="google/gemma-3-4b-it-qat-q4_0-gguf",
-    gguf_backbone="gemma-3-4b-it-q4_0.gguf",
-    gguf_mmproj="mmproj-model-f16-4B.gguf",
-    prompt=_GEMMA3_PROMPTS,
-    image_names=_GEMMA3_IMAGE_NAMES,
-    max_model_len=4096,
-    marks=[pytest.mark.core_model],
-    mm_processor_kwargs={},
-)
-
-# Pan-and-scan multimodal - uses unquantized BF16 GGUF
-GEMMA3_CONFIG_PAN_AND_SCAN = GGUFMMTestConfig(
-    original_model="google/gemma-3-4b-it",
-    gguf_repo="unsloth/gemma-3-4b-it-GGUF",
-    gguf_backbone="gemma-3-4b-it-BF16.gguf",
-    gguf_mmproj="mmproj-BF16.gguf",
-    prompt=_GEMMA3_PROMPTS,
-    image_names=_GEMMA3_IMAGE_NAMES,
-    max_model_len=4096,
-    marks=[pytest.mark.core_model],
-    mm_processor_kwargs={"do_pan_and_scan": True},
-)
-
-MODELS_TO_TEST = [GEMMA3_CONFIG, GEMMA3_CONFIG_PAN_AND_SCAN]
-
-
-def run_multimodal_gguf_test(
-    hf_runner: type[HfRunner],
-    vllm_runner: type[VllmRunner],
-    model: GGUFMMTestConfig,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-):
-    # Load images at runtime (inside subprocess) to avoid pickle issues
-    images = [ImageAsset(name).pil_image for name in model.image_names]
-    size_factors = [0.25, 0.5, 1.0]
-    inputs_per_image = [
-        (
-            [prompt for _ in size_factors],
-            [rescale_image_size(image, factor) for factor in size_factors],
-        )
-        for image, prompt in zip(images, model.prompt)
-    ]
-
-    # NOTE: Run vLLM first to avoid CUDA init issues with multiprocessing fork.
-    # Run GGUF model via vLLM.
-    with (
-        set_default_torch_num_threads(1),
-        vllm_runner(
-            model_name=model.gguf_model,
-            enforce_eager=True,
-            tokenizer_name=model.original_model,
-            dtype=dtype,
-            max_model_len=model.max_model_len,
-            mm_processor_kwargs=model.mm_processor_kwargs,
-        ) as gguf_model,
-    ):
-        gguf_outputs_per_case = [
-            gguf_model.generate_greedy_logprobs(
-                prompts,
-                max_tokens,
-                num_logprobs=num_logprobs,
-                images=images,
-            )
-            for prompts, images in inputs_per_image
-        ]
-
-    # Then run HfRunner for HuggingFace baseline comparison.
-    with hf_runner(
-        model.original_model,
-        dtype=dtype,
-        auto_cls=AutoModelForImageTextToText,
-    ) as hf_model:
-        hf_outputs_per_case = [
-            hf_model.generate_greedy_logprobs_limit(
-                prompts,
-                max_tokens,
-                num_logprobs=num_logprobs,
-                images=images,
-            )
-            for prompts, images in inputs_per_image
-        ]
-
-    for hf_outputs, gguf_outputs in zip(hf_outputs_per_case, gguf_outputs_per_case):
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=gguf_outputs,
-            name_0="hf",
-            name_1="gguf",
-        )
-
-
-@pytest.mark.skipif(
-    not is_quant_method_supported("gguf"),
-    reason="gguf is not supported on this GPU type.",
-)
-@pytest.mark.parametrize(
-    "model",
-    [
-        pytest.param(test_config, marks=test_config.marks)
-        for test_config in MODELS_TO_TEST
-    ],
-)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [10])
-def test_gemma3_mm_gguf(
-    hf_runner: type[HfRunner],
-    vllm_runner: type[VllmRunner],
-    model: GGUFMMTestConfig,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-) -> None:
-    run_multimodal_gguf_test(
-        hf_runner, vllm_runner, model, dtype, max_tokens, num_logprobs
-    )
diff --git a/tests/models/quantization/test_gguf.py b/tests/models/quantization/test_gguf.py
deleted file mode 100644
index 064ca94f3cba..000000000000
--- a/tests/models/quantization/test_gguf.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Tests gguf models against unquantized models generations
-Note: To pass the test, quantization higher than Q4 should be used
-"""
-
-import os
-from typing import NamedTuple
-
-import pytest
-from huggingface_hub import hf_hub_download
-from pytest import MarkDecorator
-from transformers import AutoTokenizer
-
-from tests.quantization.utils import is_quant_method_supported
-
-from ...conftest import VllmRunner
-from ...utils import multi_gpu_test
-from ..utils import check_logprobs_close
-
-os.environ["TOKENIZERS_PARALLELISM"] = "true"
-
-MAX_MODEL_LEN = 1024
-
-
-class GGUFTestConfig(NamedTuple):
-    original_model: str
-    gguf_repo: str
-    gguf_filename: str
-    marks: list[MarkDecorator] = []
-
-    @property
-    def gguf_model(self):
-        return hf_hub_download(self.gguf_repo, filename=self.gguf_filename)
-
-
-LLAMA_CONFIG = GGUFTestConfig(
-    original_model="meta-llama/Llama-3.2-1B-Instruct",
-    gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
-    gguf_filename="Llama-3.2-1B-Instruct-Q6_K.gguf",
-)
-
-QWEN2_CONFIG = GGUFTestConfig(
-    original_model="Qwen/Qwen2.5-1.5B-Instruct",
-    gguf_repo="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
-    gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf",
-)
-
-QWEN3_CONFIG = GGUFTestConfig(
-    original_model="Qwen/Qwen3-0.6B",
-    gguf_repo="unsloth/Qwen3-0.6B-GGUF",
-    gguf_filename="Qwen3-0.6B-BF16.gguf",
-)
-
-PHI3_CONFIG = GGUFTestConfig(
-    original_model="microsoft/Phi-3.5-mini-instruct",
-    gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF",
-    gguf_filename="Phi-3.5-mini-instruct-IQ4_XS.gguf",
-)
-
-GPT2_CONFIG = GGUFTestConfig(
-    original_model="openai-community/gpt2-large",
-    gguf_repo="QuantFactory/gpt2-large-GGUF",
-    gguf_filename="gpt2-large.Q4_K_M.gguf",
-)
-
-STABLELM_CONFIG = GGUFTestConfig(
-    original_model="stabilityai/stablelm-3b-4e1t",
-    gguf_repo="afrideva/stablelm-3b-4e1t-GGUF",
-    gguf_filename="stablelm-3b-4e1t.q4_k_m.gguf",
-)
-
-STARCODER_CONFIG = GGUFTestConfig(
-    original_model="bigcode/starcoder2-3b",
-    gguf_repo="QuantFactory/starcoder2-3b-GGUF",
-    gguf_filename="starcoder2-3b.Q6_K.gguf",
-)
-
-DOLPHIN_CONFIG = GGUFTestConfig(
-    # Test VocabParallelEmbedding sharding issue.
-    original_model="cognitivecomputations/TinyDolphin-2.8-1.1b",
-    gguf_repo="tsunemoto/TinyDolphin-2.8-1.1b-GGUF",
-    gguf_filename="tinydolphin-2.8-1.1b.Q6_K.gguf",
-)
-
-GEMMA3_CONFIG = GGUFTestConfig(
-    original_model="google/gemma-3-270m-it",
-    gguf_repo="ggml-org/gemma-3-270m-it-qat-GGUF",
-    gguf_filename="gemma-3-270m-it-qat-Q4_0.gguf",
-)
-
-MODELS = [
-    # LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
-    QWEN2_CONFIG,
-    QWEN3_CONFIG,
-    PHI3_CONFIG,
-    GPT2_CONFIG,
-    STABLELM_CONFIG,
-    DOLPHIN_CONFIG,
-    GEMMA3_CONFIG,
-    # STARCODER_CONFIG, # broken
-]
-
-
-def check_model_outputs(
-    vllm_runner: type[VllmRunner],
-    prompts: list[str],
-    model: GGUFTestConfig,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    tp_size: int,
-):
-    tokenizer = AutoTokenizer.from_pretrained(model.original_model)
-    if tokenizer.chat_template is not None:
-        messages = [[{"role": "user", "content": prompt}] for prompt in prompts]
-        prompts = tokenizer.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True
-        )
-
-    # Run gguf model.
-    with vllm_runner(
-        model_name=model.gguf_model,
-        enforce_eager=True,
-        tokenizer_name=model.original_model,
-        dtype=dtype,
-        max_model_len=MAX_MODEL_LEN,
-        tensor_parallel_size=tp_size,
-    ) as gguf_model:
-        gguf_outputs = gguf_model.generate_greedy_logprobs(
-            prompts[:-1], max_tokens, num_logprobs
-        )
-
-    # Run unquantized model.
-    # Should run with tp=1, otherwise the test will stuck at
-    # nccl initialization.
-    with vllm_runner(
-        model_name=model.original_model,
-        enforce_eager=True,  # faster tests
-        dtype=dtype,
-        max_model_len=MAX_MODEL_LEN,
-        tensor_parallel_size=1,
-    ) as original_model:
-        original_outputs = original_model.generate_greedy_logprobs(
-            prompts[:-1], max_tokens, num_logprobs
-        )
-
-    check_logprobs_close(
-        outputs_0_lst=original_outputs,
-        outputs_1_lst=gguf_outputs,
-        name_0="original",
-        name_1="gguf",
-    )
-
-
-@pytest.mark.skipif(
-    not is_quant_method_supported("gguf"),
-    reason="gguf is not supported on this GPU type.",
-)
-@pytest.mark.parametrize(
-    "model",
-    [pytest.param(test_config, marks=test_config.marks) for test_config in MODELS],
-)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("tp_size", [1])
-def test_models(
-    vllm_runner: type[VllmRunner],
-    example_prompts: list[str],
-    model: GGUFTestConfig,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    tp_size: int,
-) -> None:
-    check_model_outputs(
-        vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
-    )
-
-
-@pytest.mark.skipif(
-    not is_quant_method_supported("gguf"),
-    reason="gguf is not supported on this GPU type.",
-)
-@pytest.mark.parametrize("model", [LLAMA_CONFIG])
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [8])
-@pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("tp_size", [2])
-@multi_gpu_test(num_gpus=2)
-def test_distributed(
-    vllm_runner: type[VllmRunner],
-    example_prompts: list[str],
-    model: GGUFTestConfig,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    tp_size: int,
-) -> None:
-    check_model_outputs(
-        vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
-    )
diff --git a/tests/models/test_gguf_download.py b/tests/models/test_gguf_download.py
deleted file mode 100644
index e9ca35afd66a..000000000000
--- a/tests/models/test_gguf_download.py
+++ /dev/null
@@ -1,221 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from unittest.mock import MagicMock, patch
-
-import pytest
-
-from vllm.config import ModelConfig
-from vllm.config.load import LoadConfig
-from vllm.model_executor.model_loader.gguf_loader import GGUFModelLoader
-from vllm.model_executor.model_loader.weight_utils import download_gguf
-
-
-class TestGGUFDownload:
-    """Test GGUF model downloading functionality."""
-
-    @patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf")
-    def test_download_gguf_single_file(self, mock_download):
-        """Test downloading a single GGUF file."""
-        # Setup mock
-        mock_folder = "/tmp/mock_cache"
-        mock_download.return_value = mock_folder
-
-        # Mock glob to return a single file
-        with patch("glob.glob") as mock_glob:
-            mock_glob.side_effect = lambda pattern, **kwargs: (
-                [f"{mock_folder}/model-IQ1_S.gguf"] if "IQ1_S" in pattern else []
-            )
-
-            result = download_gguf("unsloth/Qwen3-0.6B-GGUF", "IQ1_S")
-
-            # Verify download_weights_from_hf was called with correct patterns
-            mock_download.assert_called_once_with(
-                model_name_or_path="unsloth/Qwen3-0.6B-GGUF",
-                cache_dir=None,
-                allow_patterns=[
-                    "*-IQ1_S.gguf",
-                    "*-IQ1_S-*.gguf",
-                    "*/*-IQ1_S.gguf",
-                    "*/*-IQ1_S-*.gguf",
-                ],
-                revision=None,
-                ignore_patterns=None,
-            )
-
-            # Verify result is the file path, not folder
-            assert result == f"{mock_folder}/model-IQ1_S.gguf"
-
-    @patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf")
-    def test_download_gguf_sharded_files(self, mock_download):
-        """Test downloading sharded GGUF files."""
-        mock_folder = "/tmp/mock_cache"
-        mock_download.return_value = mock_folder
-
-        # Mock glob to return sharded files
-        with patch("glob.glob") as mock_glob:
-            mock_glob.side_effect = lambda pattern, **kwargs: (
-                [
-                    f"{mock_folder}/model-Q2_K-00001-of-00002.gguf",
-                    f"{mock_folder}/model-Q2_K-00002-of-00002.gguf",
-                ]
-                if "Q2_K" in pattern
-                else []
-            )
-
-            result = download_gguf("unsloth/gpt-oss-120b-GGUF", "Q2_K")
-
-            # Should return the first file after sorting
-            assert result == f"{mock_folder}/model-Q2_K-00001-of-00002.gguf"
-
-    @patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf")
-    def test_download_gguf_subdir(self, mock_download):
-        """Test downloading GGUF files from subdirectory."""
-        mock_folder = "/tmp/mock_cache"
-        mock_download.return_value = mock_folder
-
-        with patch("glob.glob") as mock_glob:
-            mock_glob.side_effect = lambda pattern, **kwargs: (
-                [f"{mock_folder}/Q2_K/model-Q2_K.gguf"]
-                if "Q2_K" in pattern or "**/*.gguf" in pattern
-                else []
-            )
-
-            result = download_gguf("unsloth/gpt-oss-120b-GGUF", "Q2_K")
-
-            assert result == f"{mock_folder}/Q2_K/model-Q2_K.gguf"
-
-    @patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf")
-    @patch("glob.glob", return_value=[])
-    def test_download_gguf_no_files_found(self, mock_glob, mock_download):
-        """Test error when no GGUF files are found."""
-        mock_folder = "/tmp/mock_cache"
-        mock_download.return_value = mock_folder
-
-        with pytest.raises(ValueError, match="Downloaded GGUF files not found"):
-            download_gguf("unsloth/Qwen3-0.6B-GGUF", "IQ1_S")
-
-
-class TestGGUFModelLoader:
-    """Test GGUFModelLoader class methods."""
-
-    @patch("os.path.isfile", return_value=True)
-    def test_prepare_weights_local_file(self, mock_isfile):
-        """Test _prepare_weights with local file."""
-        load_config = LoadConfig(load_format="gguf")
-        loader = GGUFModelLoader(load_config)
-
-        # Create a simple mock ModelConfig with only the model attribute
-        model_config = MagicMock()
-        model_config.model = "/path/to/model.gguf"
-
-        result = loader._prepare_weights(model_config)
-        assert result == "/path/to/model.gguf"
-        mock_isfile.assert_called_once_with("/path/to/model.gguf")
-
-    @patch("vllm.model_executor.model_loader.gguf_loader.hf_hub_download")
-    @patch("os.path.isfile", return_value=False)
-    def test_prepare_weights_repo_filename(self, mock_isfile, mock_hf_download):
-        """Test _prepare_weights with repo_id/filename.gguf format."""
-        load_config = LoadConfig(load_format="gguf")
-        loader = GGUFModelLoader(load_config)
-
-        mock_hf_download.return_value = "/downloaded/model.gguf"
-
-        # Create a simple mock ModelConfig with only the model attribute
-        model_config = MagicMock()
-        model_config.model = "unsloth/Qwen3-0.6B-GGUF/model.gguf"
-
-        result = loader._prepare_weights(model_config)
-        assert result == "/downloaded/model.gguf"
-        mock_hf_download.assert_called_once_with(
-            repo_id="unsloth/Qwen3-0.6B-GGUF", filename="model.gguf"
-        )
-
-    @patch("vllm.config.model.get_hf_image_processor_config", return_value=None)
-    @patch("vllm.transformers_utils.config.file_or_path_exists", return_value=True)
-    @patch("vllm.config.model.get_config")
-    @patch("vllm.config.model.is_gguf", return_value=True)
-    @patch("vllm.model_executor.model_loader.gguf_loader.download_gguf")
-    @patch("os.path.isfile", return_value=False)
-    def test_prepare_weights_repo_quant_type(
-        self,
-        mock_isfile,
-        mock_download_gguf,
-        mock_is_gguf,
-        mock_get_config,
-        mock_file_exists,
-        mock_get_image_config,
-    ):
-        """Test _prepare_weights with repo_id:quant_type format."""
-        mock_hf_config = MagicMock()
-        mock_hf_config.architectures = ["Qwen3ForCausalLM"]
-
-        class MockTextConfig:
-            max_position_embeddings = 4096
-            sliding_window = None
-            model_type = "qwen3"
-            num_attention_heads = 32
-
-        mock_text_config = MockTextConfig()
-        mock_hf_config.get_text_config.return_value = mock_text_config
-        mock_hf_config.dtype = "bfloat16"
-        mock_get_config.return_value = mock_hf_config
-
-        load_config = LoadConfig(load_format="gguf")
-        loader = GGUFModelLoader(load_config)
-
-        mock_download_gguf.return_value = "/downloaded/model-IQ1_S.gguf"
-
-        model_config = ModelConfig(
-            model="unsloth/Qwen3-0.6B-GGUF:IQ1_S", tokenizer="Qwen/Qwen3-0.6B"
-        )
-        result = loader._prepare_weights(model_config)
-        # The actual result will be the downloaded file path from mock
-        assert result == "/downloaded/model-IQ1_S.gguf"
-        mock_download_gguf.assert_called_once_with(
-            "unsloth/Qwen3-0.6B-GGUF",
-            "IQ1_S",
-            cache_dir=None,
-            revision=None,
-            ignore_patterns=["original/**/*"],
-        )
-
-    @patch("vllm.config.model.get_hf_image_processor_config", return_value=None)
-    @patch("vllm.config.model.get_config")
-    @patch("vllm.config.model.is_gguf", return_value=False)
-    @patch("vllm.transformers_utils.gguf_utils.check_gguf_file", return_value=False)
-    @patch("os.path.isfile", return_value=False)
-    def test_prepare_weights_invalid_format(
-        self,
-        mock_isfile,
-        mock_check_gguf,
-        mock_is_gguf,
-        mock_get_config,
-        mock_get_image_config,
-    ):
-        """Test _prepare_weights with invalid format."""
-        mock_hf_config = MagicMock()
-        mock_hf_config.architectures = ["Qwen3ForCausalLM"]
-
-        class MockTextConfig:
-            max_position_embeddings = 4096
-            sliding_window = None
-            model_type = "qwen3"
-            num_attention_heads = 32
-
-        mock_text_config = MockTextConfig()
-        mock_hf_config.get_text_config.return_value = mock_text_config
-        mock_hf_config.dtype = "bfloat16"
-        mock_get_config.return_value = mock_hf_config
-
-        load_config = LoadConfig(load_format="gguf")
-        loader = GGUFModelLoader(load_config)
-
-        # Create ModelConfig with a valid repo_id to avoid validation errors
-        # Then test _prepare_weights with invalid format
-        model_config = ModelConfig(model="unsloth/Qwen3-0.6B")
-        # Manually set model to invalid format after creation
-        model_config.model = "invalid-format"
-        with pytest.raises(ValueError, match="Unrecognised GGUF reference"):
-            loader._prepare_weights(model_config)
diff --git a/tests/transformers_utils/test_utils.py b/tests/transformers_utils/test_utils.py
index 94dd014c929f..adcb02a9300a 100644
--- a/tests/transformers_utils/test_utils.py
+++ b/tests/transformers_utils/test_utils.py
@@ -1,15 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from pathlib import Path
-from unittest.mock import patch
-
-import pytest
-
-from vllm.transformers_utils.gguf_utils import (
-    is_gguf,
-    is_remote_gguf,
-    split_remote_gguf,
-)
 from vllm.transformers_utils.utils import (
     is_azure,
     is_cloud_storage,
@@ -45,203 +35,3 @@ def test_is_cloud_storage():
     assert is_cloud_storage("az://model-container/path")
     assert not is_cloud_storage("/unix/local/path")
     assert not is_cloud_storage("nfs://nfs-fqdn.local")
-
-
-class TestIsRemoteGGUF:
-    """Test is_remote_gguf utility function."""
-
-    def test_is_remote_gguf_with_colon_and_slash(self):
-        """Test is_remote_gguf with repo_id:quant_type format."""
-        # Valid quant types (exact GGML types)
-        assert is_remote_gguf("unsloth/Qwen3-0.6B-GGUF:IQ1_S")
-        assert is_remote_gguf("user/repo:Q2_K")
-        assert is_remote_gguf("repo/model:Q4_K")
-        assert is_remote_gguf("repo/model:Q8_0")
-
-        # Invalid quant types should return False
-        assert not is_remote_gguf("repo/model:quant")
-        assert not is_remote_gguf("repo/model:INVALID")
-        assert not is_remote_gguf("repo/model:invalid_type")
-
-    def test_is_remote_gguf_extended_quant_types(self):
-        """Test is_remote_gguf with extended quant type naming conventions."""
-        # Extended quant types with _M, _S, _L suffixes
-        assert is_remote_gguf("repo/model:Q4_K_M")
-        assert is_remote_gguf("repo/model:Q4_K_S")
-        assert is_remote_gguf("repo/model:Q3_K_L")
-        assert is_remote_gguf("repo/model:Q5_K_M")
-        assert is_remote_gguf("repo/model:Q3_K_S")
-
-        # Extended quant types with _XL, _XS, _XXS suffixes
-        assert is_remote_gguf("repo/model:Q5_K_XL")
-        assert is_remote_gguf("repo/model:IQ4_XS")
-        assert is_remote_gguf("repo/model:IQ3_XXS")
-
-        # Invalid extended types (base type doesn't exist)
-        assert not is_remote_gguf("repo/model:INVALID_M")
-        assert not is_remote_gguf("repo/model:Q9_K_M")
-
-    def test_is_remote_gguf_nonstandard_quant_type(self):
-        """Test is_remote_gguf with non-standard quant types containing
-        a known GGML type."""
-        # Non-standard quant types with known GGML type after prefix
-        assert is_remote_gguf("unsloth/Qwen3.5-35B-A3B-GGUF:UD-Q4_K_XL")
-        assert is_remote_gguf("user/Model:UD-Q4_K_M")
-        assert is_remote_gguf("user/SomeModel:Custom-Q8_0")
-
-        # Exact GGML type after prefix (no suffix stripping needed)
-        assert is_remote_gguf("user/Model-GGUF:UD-IQ4_NL")
-        assert is_remote_gguf("user/Model-GGUF:UD-Q8_0")
-
-        # Completely unknown quant types should still fail
-        assert not is_remote_gguf("repo/model:TOTALLY-RANDOM")
-        assert not is_remote_gguf("user/Model:UD-INVALID")
-
-        # No dash separator → not recognized as prefixed
-        assert not is_remote_gguf("repo/model:UDIQ4NL")
-
-    def test_is_remote_gguf_without_colon(self):
-        """Test is_remote_gguf without colon."""
-        assert not is_remote_gguf("repo/model")
-        assert not is_remote_gguf("unsloth/Qwen3-0.6B-GGUF")
-
-    def test_is_remote_gguf_without_slash(self):
-        """Test is_remote_gguf without slash."""
-        assert not is_remote_gguf("model.gguf")
-        # Even with valid quant_type, no slash means not remote GGUF
-        assert not is_remote_gguf("model:IQ1_S")
-        assert not is_remote_gguf("model:quant")
-
-    def test_is_remote_gguf_local_path(self):
-        """Test is_remote_gguf with local file path."""
-        assert not is_remote_gguf("/path/to/model.gguf")
-        assert not is_remote_gguf("./model.gguf")
-
-    def test_is_remote_gguf_with_path_object(self):
-        """Test is_remote_gguf with Path object."""
-        assert is_remote_gguf(Path("unsloth/Qwen3-0.6B-GGUF:IQ1_S"))
-        assert not is_remote_gguf(Path("repo/model"))
-
-    def test_is_remote_gguf_with_http_https(self):
-        """Test is_remote_gguf with HTTP/HTTPS URLs."""
-        # HTTP/HTTPS URLs should return False even with valid quant_type
-        assert not is_remote_gguf("http://example.com/repo/model:IQ1_S")
-        assert not is_remote_gguf("https://huggingface.co/repo/model:Q2_K")
-        assert not is_remote_gguf("http://repo/model:Q4_K")
-        assert not is_remote_gguf("https://repo/model:Q8_0")
-
-    def test_is_remote_gguf_with_cloud_storage(self):
-        """Test is_remote_gguf with cloud storage paths."""
-        # Cloud storage paths should return False even with valid quant_type
-        assert not is_remote_gguf("s3://bucket/repo/model:IQ1_S")
-        assert not is_remote_gguf("gs://bucket/repo/model:Q2_K")
-        assert not is_remote_gguf("s3://repo/model:Q4_K")
-        assert not is_remote_gguf("gs://repo/model:Q8_0")
-
-
-class TestSplitRemoteGGUF:
-    """Test split_remote_gguf utility function."""
-
-    def test_split_remote_gguf_valid(self):
-        """Test split_remote_gguf with valid repo_id:quant_type format."""
-        repo_id, quant_type = split_remote_gguf("unsloth/Qwen3-0.6B-GGUF:IQ1_S")
-        assert repo_id == "unsloth/Qwen3-0.6B-GGUF"
-        assert quant_type == "IQ1_S"
-
-        repo_id, quant_type = split_remote_gguf("repo/model:Q2_K")
-        assert repo_id == "repo/model"
-        assert quant_type == "Q2_K"
-
-    def test_split_remote_gguf_extended_quant_types(self):
-        """Test split_remote_gguf with extended quant type naming conventions."""
-        repo_id, quant_type = split_remote_gguf("unsloth/Qwen3-0.6B-GGUF:Q4_K_M")
-        assert repo_id == "unsloth/Qwen3-0.6B-GGUF"
-        assert quant_type == "Q4_K_M"
-
-        repo_id, quant_type = split_remote_gguf("repo/model:Q3_K_S")
-        assert repo_id == "repo/model"
-        assert quant_type == "Q3_K_S"
-
-    def test_split_remote_gguf_nonstandard_quant_type(self):
-        """Test split_remote_gguf with non-standard quant types in GGUF repos."""
-        repo_id, quant_type = split_remote_gguf(
-            "unsloth/Qwen3.5-35B-A3B-GGUF:UD-Q4_K_XL"
-        )
-        assert repo_id == "unsloth/Qwen3.5-35B-A3B-GGUF"
-        assert quant_type == "UD-Q4_K_XL"
-
-    def test_split_remote_gguf_with_path_object(self):
-        """Test split_remote_gguf with Path object."""
-        repo_id, quant_type = split_remote_gguf(Path("unsloth/Qwen3-0.6B-GGUF:IQ1_S"))
-        assert repo_id == "unsloth/Qwen3-0.6B-GGUF"
-        assert quant_type == "IQ1_S"
-
-    def test_split_remote_gguf_invalid(self):
-        """Test split_remote_gguf with invalid format."""
-        # Invalid format (no colon) - is_remote_gguf returns False
-        with pytest.raises(ValueError, match="Wrong GGUF model"):
-            split_remote_gguf("repo/model")
-
-        # Invalid quant type - is_remote_gguf returns False
-        with pytest.raises(ValueError, match="Wrong GGUF model"):
-            split_remote_gguf("repo/model:INVALID_TYPE")
-
-        # HTTP URL - is_remote_gguf returns False
-        with pytest.raises(ValueError, match="Wrong GGUF model"):
-            split_remote_gguf("http://repo/model:IQ1_S")
-
-        # Cloud storage - is_remote_gguf returns False
-        with pytest.raises(ValueError, match="Wrong GGUF model"):
-            split_remote_gguf("s3://bucket/repo/model:Q2_K")
-
-
-class TestIsGGUF:
-    """Test is_gguf utility function."""
-
-    @patch("vllm.transformers_utils.gguf_utils.check_gguf_file", return_value=True)
-    def test_is_gguf_with_local_file(self, mock_check_gguf):
-        """Test is_gguf with local GGUF file."""
-        assert is_gguf("/path/to/model.gguf")
-        assert is_gguf("./model.gguf")
-
-    def test_is_gguf_with_remote_gguf(self):
-        """Test is_gguf with remote GGUF format."""
-        # Valid remote GGUF format (repo_id:quant_type with valid quant_type)
-        assert is_gguf("unsloth/Qwen3-0.6B-GGUF:IQ1_S")
-        assert is_gguf("repo/model:Q2_K")
-        assert is_gguf("repo/model:Q4_K")
-
-        # Extended quant types with suffixes
-        assert is_gguf("repo/model:Q4_K_M")
-        assert is_gguf("repo/model:Q3_K_S")
-        assert is_gguf("repo/model:Q5_K_L")
-
-        # Invalid quant_type should return False
-        assert not is_gguf("repo/model:quant")
-        assert not is_gguf("repo/model:INVALID")
-
-    @patch("vllm.transformers_utils.gguf_utils.check_gguf_file", return_value=False)
-    def test_is_gguf_false(self, mock_check_gguf):
-        """Test is_gguf returns False for non-GGUF models."""
-        assert not is_gguf("unsloth/Qwen3-0.6B")
-        assert not is_gguf("repo/model")
-        assert not is_gguf("model")
-
-    def test_is_gguf_edge_cases(self):
-        """Test is_gguf with edge cases."""
-        # Empty string
-        assert not is_gguf("")
-
-        # Only colon, no slash (even with valid quant_type)
-        assert not is_gguf("model:IQ1_S")
-
-        # Only slash, no colon
-        assert not is_gguf("repo/model")
-
-        # HTTP/HTTPS URLs
-        assert not is_gguf("http://repo/model:IQ1_S")
-        assert not is_gguf("https://repo/model:Q2_K")
-
-        # Cloud storage
-        assert not is_gguf("s3://bucket/repo/model:IQ1_S")
-        assert not is_gguf("gs://bucket/repo/model:Q2_K")
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 01ac03f27a3f..eada640e7612 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -764,69 +764,6 @@ def _allspark_w8a16_gemm_fake(
         return torch.empty((m, n), device=a.device, dtype=a.dtype)
 
 
-if hasattr(torch.ops._C, "ggml_dequantize"):
-
-    @register_fake("_C::ggml_dequantize")
-    def _ggml_dequantize_fake(
-        W: torch.Tensor,
-        quant_type: int,
-        m: torch.SymInt,
-        n: torch.SymInt,
-        dtype: torch.dtype | None = None,
-    ) -> torch.Tensor:
-        return torch.empty((m, n), dtype=torch.float16, device=W.device)
-
-    @register_fake("_C::ggml_mul_mat_vec_a8")
-    def _ggml_mul_mat_vec_a8_fake(
-        W: torch.Tensor,
-        X: torch.Tensor,
-        quant_type: int,
-        row: torch.SymInt,
-    ) -> torch.Tensor:
-        return torch.empty((X.shape[0], row), dtype=X.dtype, device=W.device)
-
-    @register_fake("_C::ggml_mul_mat_a8")
-    def _ggml_mul_mat_a8_fake(
-        W: torch.Tensor,
-        X: torch.Tensor,
-        quant_type: int,
-        row: torch.SymInt,
-    ) -> torch.Tensor:
-        batch = X.size(0)
-        return torch.empty((batch, row), dtype=X.dtype, device=W.device)
-
-    @register_fake("_C::ggml_moe_a8")
-    def _ggml_moe_a8_fake(
-        X: torch.Tensor,
-        W: torch.Tensor,
-        sorted_token_ids: torch.Tensor,
-        expert_ids: torch.Tensor,
-        num_tokens_post_padded: torch.Tensor,
-        quant_type: int,
-        row: torch.SymInt,
-        top_k: torch.SymInt,
-        tokens: torch.SymInt,
-    ) -> torch.Tensor:
-        tokens = X.size(0)
-        return torch.empty((tokens * top_k, row), dtype=torch.float16, device=W.device)
-
-
-if hasattr(torch.ops._C, "ggml_moe_a8_vec"):
-
-    @register_fake("_C::ggml_moe_a8_vec")
-    def _ggml_moe_a8_vec_fake(
-        X: torch.Tensor,
-        W: torch.Tensor,
-        topk_ids: torch.Tensor,
-        top_k: int,
-        quant_type: int,
-        row: torch.SymInt,
-        tokens: torch.SymInt,
-    ) -> torch.Tensor:
-        tokens = X.size(0)
-        return torch.empty((tokens * top_k, row), dtype=X.dtype, device=W.device)
-
-
 # cutlass
 def cutlass_scaled_mm_supports_fp4(cuda_device_capability: int) -> bool:
     return torch.ops._C.cutlass_scaled_mm_supports_fp4(cuda_device_capability)
@@ -2041,71 +1978,6 @@ def scaled_int8_quant(
     return output, input_scales, input_azp
 
 
-# gguf
-def ggml_dequantize(
-    W: torch.Tensor, quant_type: int, m: int, n: int, dtype: torch.dtype | None
-) -> torch.Tensor:
-    return torch.ops._C.ggml_dequantize(W, quant_type, m, n, dtype)
-
-
-def ggml_mul_mat_vec_a8(
-    W: torch.Tensor,
-    X: torch.Tensor,
-    quant_type: int,
-    row: int,
-) -> torch.Tensor:
-    return torch.ops._C.ggml_mul_mat_vec_a8(W, X, quant_type, row)
-
-
-def ggml_mul_mat_a8(
-    W: torch.Tensor,
-    X: torch.Tensor,
-    quant_type: int,
-    row: int,
-) -> torch.Tensor:
-    return torch.ops._C.ggml_mul_mat_a8(W, X, quant_type, row)
-
-
-def ggml_moe_a8(
-    X: torch.Tensor,
-    W: torch.Tensor,
-    sorted_token_ids: torch.Tensor,
-    expert_ids: torch.Tensor,
-    num_tokens_post_padded: torch.Tensor,
-    quant_type: int,
-    row: int,
-    top_k: int,
-    tokens: int,
-) -> torch.Tensor:
-    return torch.ops._C.ggml_moe_a8(
-        X,
-        W,
-        sorted_token_ids,
-        expert_ids,
-        num_tokens_post_padded,
-        quant_type,
-        row,
-        top_k,
-        tokens,
-    )
-
-
-def ggml_moe_a8_vec(
-    X: torch.Tensor,
-    W: torch.Tensor,
-    topk_ids: torch.Tensor,
-    top_k: int,
-    quant_type: int,
-    row: torch.SymInt,
-    tokens: torch.SymInt,
-) -> torch.Tensor:
-    return torch.ops._C.ggml_moe_a8_vec(X, W, topk_ids, top_k, quant_type, row, tokens)
-
-
-def ggml_moe_get_block_size(quant_type: int) -> int:
-    return torch.ops._C.ggml_moe_get_block_size(quant_type)
-
-
 # mamba
 def selective_scan_fwd(
     u: torch.Tensor,
diff --git a/vllm/config/load.py b/vllm/config/load.py
index 93240ec5fc0f..e27c1ce0fd0c 100644
--- a/vllm/config/load.py
+++ b/vllm/config/load.py
@@ -48,8 +48,6 @@ class LoadConfig:
     - "bitsandbytes" will load the weights using bitsandbytes quantization.
     - "sharded_state" will load weights from pre-sharded checkpoint files,
       supporting efficient loading of tensor-parallel models.
-    - "gguf" will load weights from GGUF format files (details specified in
-      https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).
     - "mistral" will load weights from consolidated safetensors files used by
       Mistral models.
     - Other custom values can be supported via plugins.
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 1cce7f9d94cc..054f14a26fef 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -25,6 +25,7 @@
 from vllm.config.scheduler import RunnerType
 from vllm.config.utils import config, getattr_iter
 from vllm.logger import init_logger
+from vllm.model_format import get_model_format_handler
 from vllm.platforms import current_platform
 from vllm.tasks import PoolingTask, ScoreType, SupportedTask
 from vllm.transformers_utils.config import (
@@ -42,12 +43,6 @@
     uses_mrope,
     uses_xdrope_dim,
 )
-from vllm.transformers_utils.gguf_utils import (
-    is_gguf,
-    is_remote_gguf,
-    maybe_patch_hf_config_from_gguf,
-    split_remote_gguf,
-)
 from vllm.transformers_utils.model_arch_config_convertor import (
     MODEL_ARCH_CONFIG_CONVERTORS,
     ModelArchConfigConvertorBase,
@@ -503,10 +498,8 @@ def __post_init__(
             hf_overrides_fn=hf_overrides_fn,
             token=self.hf_token,
         )
-        hf_config = maybe_patch_hf_config_from_gguf(
-            self.model,
-            hf_config,
-        )
+        if handler := get_model_format_handler(self.model):
+            hf_config = handler.patch_model_hf_config(self.model, hf_config)
 
         self.hf_config = hf_config
         if dict_overrides:
@@ -666,12 +659,8 @@ def __post_init__(
                 )
 
         # Multimodal GGUF models must use original repo for mm processing
-        if is_gguf(self.tokenizer) and self.is_multimodal_model:
-            raise ValueError(
-                "Loading a multimodal GGUF model needs to use original "
-                "tokenizer. Please specify the unquantized hf model's "
-                "repo name or path using the --tokenizer argument."
-            )
+        if handler := get_model_format_handler(self.model):
+            handler.validate_model_config(self)
 
         if self.disable_sliding_window:
             # Set after get_and_verify_max_len to ensure that max_model_len
@@ -826,8 +815,8 @@ def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> No
 
     def _get_encoder_config(self) -> dict[str, Any] | None:
         model = self.model
-        if is_remote_gguf(model):
-            model, _ = split_remote_gguf(model)
+        if handler := get_model_format_handler(model):
+            model = handler.resolve_sentence_transformer_source(model, self.revision)
         return get_sentence_transformer_tokenizer_config(model, self.revision)
 
     def _get_default_runner_type(
@@ -952,7 +941,6 @@ def _verify_quantization(self) -> None:
                 # imports during override detection (e.g., MXFP4 imports Triton)
                 "mxfp4",
                 "cpu_awq",
-                "gguf",
             ]
             quantization_methods = [
                 q for q in supported_quantization if q not in overrides
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index c9b90848ff04..b42904367883 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -93,6 +93,7 @@
 from vllm.config.utils import get_field
 from vllm.config.vllm import OptimizationLevel, PerformanceMode
 from vllm.logger import init_logger, suppress_logging
+from vllm.model_format import get_model_format_handler
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.plugins import load_general_plugins
 from vllm.ray.lazy_utils import is_in_ray_actor, is_ray_initialized
@@ -100,7 +101,6 @@
     is_interleaved,
     maybe_override_with_speculators,
 )
-from vllm.transformers_utils.gguf_utils import is_gguf
 from vllm.transformers_utils.repo_utils import get_model_path
 from vllm.transformers_utils.utils import is_cloud_storage
 from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -1416,9 +1416,10 @@ def from_cli_args(cls, args: argparse.Namespace):
         return engine_args
 
     def create_model_config(self) -> ModelConfig:
-        # gguf file needs a specific model loader
-        if is_gguf(self.model):
-            self.quantization = self.load_format = "gguf"
+        load_general_plugins()
+
+        if handler := get_model_format_handler(self.model):
+            handler.update_engine_args(self)
 
         if not envs.VLLM_ENABLE_V1_MULTIPROCESSING:
             logger.warning(
@@ -1559,6 +1560,7 @@ def create_engine_config(
         NOTE: If VllmConfig is incompatible, we raise an error.
         """
         current_platform.pre_register_and_update()
+        load_general_plugins()
 
         device_config = DeviceConfig(device=cast(Device, current_platform.device_type))
 
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index c4fc1fd2557e..178aa361602d 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1113,9 +1113,11 @@ def weight_loader(
         # dimension intermediate_size_per_partition is used.
         SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0}
 
-        is_gguf_weight = getattr(param, "is_gguf_weight", False)
-        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
-        if is_gguf_weight_type:
+        needs_custom_weight_materialization = getattr(
+            param, "needs_custom_weight_materialization", False
+        )
+        needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False)
+        if needs_custom_weight_type:
             param.weight_type = loaded_weight.item()
             param.data.copy_(loaded_weight)
             return True if return_success else None
@@ -1165,8 +1167,9 @@ def weight_loader(
         if full_load:
             shard_dim += 1
 
-        # Materialize GGUF UninitializedParameter accounting merged weights
-        if is_gguf_weight and isinstance(param, UninitializedParameter):
+        if needs_custom_weight_materialization and isinstance(
+            param, UninitializedParameter
+        ):
             # To materialize a tensor, we must have full shape including
             # number of experts, making this portion to require `full_load`.
             assert full_load
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 975fedabd675..297a10bd3942 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -360,15 +360,16 @@ def __init__(
     def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         # If the weight on disk does not have a shape, give it one
         # (such scales for AutoFp8).
-        # Special case for GGUF
-
-        is_gguf_weight = getattr(param, "is_gguf_weight", False)
-        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
-        if is_gguf_weight_type:
+        needs_custom_weight_materialization = getattr(
+            param, "needs_custom_weight_materialization", False
+        )
+        needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False)
+        if needs_custom_weight_type:
             param.weight_type = loaded_weight.item()
 
-        # Materialize GGUF UninitializedParameter
-        if is_gguf_weight and isinstance(param, UninitializedParameter):
+        if needs_custom_weight_materialization and isinstance(
+            param, UninitializedParameter
+        ):
             param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype)
 
         if len(loaded_weight.shape) == 0:
@@ -534,14 +535,16 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         # no need to narrow
         is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit
 
-        # Special case for GGUF
-        is_gguf_weight = getattr(param, "is_gguf_weight", False)
-        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
-        if is_gguf_weight_type:
+        needs_custom_weight_materialization = getattr(
+            param, "needs_custom_weight_materialization", False
+        )
+        needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False)
+        if needs_custom_weight_type:
             param.weight_type = loaded_weight.item()
 
-        # Materialize GGUF UninitializedParameter
-        if is_gguf_weight and isinstance(param, UninitializedParameter):
+        if needs_custom_weight_materialization and isinstance(
+            param, UninitializedParameter
+        ):
             final_shape = list(loaded_weight.shape)
             if output_dim is not None:
                 assert final_shape[output_dim] % self.tp_size == 0
@@ -692,17 +695,18 @@ def weight_loader(
         loaded_shard_id: tuple[int, ...] | int | None = None,
     ):
         self.validate_shard_id(loaded_shard_id)
-        # Special case for GGUF
-        # initialize GGUF param after we know the quantize type
-        is_gguf_weight = getattr(param, "is_gguf_weight", False)
-        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
+        needs_custom_weight_materialization = getattr(
+            param, "needs_custom_weight_materialization", False
+        )
+        needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False)
         if isinstance(loaded_shard_id, tuple) and (
-            is_gguf_weight or is_gguf_weight_type
+            needs_custom_weight_materialization or needs_custom_weight_type
         ):
             raise NotImplementedError(
-                "Shard id with multiple indices is not supported for GGUF."
+                "Shard id with multiple indices is not supported for this "
+                "format-specific weight loader."
             )
-        if is_gguf_weight_type:
+        if needs_custom_weight_type:
             if loaded_shard_id is not None:
                 param.data[loaded_shard_id].copy_(loaded_weight)
                 param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
@@ -712,7 +716,7 @@ def weight_loader(
                 }
             return
 
-        if is_gguf_weight:
+        if needs_custom_weight_materialization:
             output_dim = getattr(param, "output_dim", None)
             shard_size = loaded_weight.size(output_dim) // self.tp_size
             start_idx = self.tp_rank * shard_size
@@ -1168,11 +1172,11 @@ def weight_loader(
         loaded_shard_id: str | None = None,
     ):
         self.validate_shard_id(loaded_shard_id)
-        # Special case for GGUF
-        # initialize GGUF param after we know the quantize type
-        is_gguf_weight = getattr(param, "is_gguf_weight", False)
-        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
-        if is_gguf_weight_type:
+        needs_custom_weight_materialization = getattr(
+            param, "needs_custom_weight_materialization", False
+        )
+        needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False)
+        if needs_custom_weight_type:
             idx_map = {"q": 0, "k": 1, "v": 2}
             if loaded_shard_id is not None:
                 param.data[idx_map[loaded_shard_id]].copy_(loaded_weight)
@@ -1181,7 +1185,7 @@ def weight_loader(
                 param.shard_weight_type = {k: loaded_weight.item() for k in idx_map}
             return
 
-        if is_gguf_weight:
+        if needs_custom_weight_materialization:
             output_dim = getattr(param, "output_dim", None)
             shard_size = loaded_weight.size(output_dim) // self.tp_size
             start_idx = self.tp_rank * shard_size
@@ -1480,14 +1484,16 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         # no need to narrow
         is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit
 
-        # Special case for GGUF
-        is_gguf_weight = getattr(param, "is_gguf_weight", False)
-        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
-        if is_gguf_weight_type:
+        needs_custom_weight_materialization = getattr(
+            param, "needs_custom_weight_materialization", False
+        )
+        needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False)
+        if needs_custom_weight_type:
             param.weight_type = loaded_weight.item()
 
-        # Materialize GGUF UninitializedParameter
-        if is_gguf_weight and isinstance(param, UninitializedParameter):
+        if needs_custom_weight_materialization and isinstance(
+            param, UninitializedParameter
+        ):
             weight_shape = list(loaded_weight.shape)
             if input_dim:
                 weight_shape[input_dim] = weight_shape[input_dim] // self.tp_size
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index 1ac0f9ee9cc5..b2fb6a528a5d 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -18,7 +18,6 @@
     "modelopt_fp4",
     "modelopt_mxfp8",
     "modelopt_mixed",
-    "gguf",
     "gptq_marlin",
     "awq_marlin",
     "gptq",
@@ -122,7 +121,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
     from .fbgemm_fp8 import FBGEMMFp8Config
     from .fp8 import Fp8Config
     from .fp_quant import FPQuantConfig
-    from .gguf import GGUFConfig
     from .gptq import GPTQConfig
     from .gptq_marlin import GPTQMarlinConfig
     from .inc import INCConfig
@@ -147,7 +145,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
         "modelopt_fp4": ModelOptNvFp4Config,
         "modelopt_mxfp8": ModelOptMxFp8Config,
         "modelopt_mixed": ModelOptMixedPrecisionConfig,
-        "gguf": GGUFConfig,
         "gptq_marlin": GPTQMarlinConfig,
         "awq_marlin": AWQMarlinConfig,
         "gptq": GPTQConfig,
diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
index eedc62f7d4d5..4ed906598b7d 100644
--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -120,6 +120,10 @@ def override_quantization_method(
         """
         return None
 
+    @classmethod
+    def requires_hf_quant_config(cls) -> bool:
+        return True
+
     @staticmethod
     def get_from_keys(config: dict[str, Any], keys: list[str]) -> Any:
         """Get a value from the model's quantization config."""
@@ -156,6 +160,28 @@ def get_quant_method(
     def get_cache_scale(self, name: str) -> str | None:
         return None
 
+    def override_is_neox_style(self, model_type: str) -> bool | None:
+        return None
+
+    def should_keep_tied_lm_head(self) -> bool:
+        return False
+
+    def transform_loaded_weight(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+    ) -> torch.Tensor:
+        return loaded_weight
+
+    def remap_loaded_parameter(
+        self,
+        name: str,
+        param: torch.Tensor,
+        loaded_weight: torch.Tensor,
+        params_dict: dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        return param
+
     def apply_vllm_mapper(  # noqa: B027
         self, hf_to_vllm_mapper: "WeightsMapper"
     ):
diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
deleted file mode 100644
index 2a72da26cc62..000000000000
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ /dev/null
@@ -1,691 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from collections.abc import Mapping
-from types import MappingProxyType
-from typing import TYPE_CHECKING, Any
-
-if TYPE_CHECKING:
-    from vllm.model_executor.layers.quantization import QuantizationMethods
-
-import gguf
-import torch
-from gguf import GGMLQuantizationType as WeightType
-from torch.nn.parameter import Parameter, UninitializedParameter
-
-from vllm import _custom_ops as ops
-from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe.activation import (
-    MoEActivation,
-    apply_moe_activation,
-)
-from vllm.model_executor.layers.fused_moe.config import (
-    FusedMoEConfig,
-    FusedMoEQuantConfig,
-)
-from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE,
-    FusedMoEMethodBase,
-)
-from vllm.model_executor.layers.linear import (
-    LinearBase,
-    LinearMethodBase,
-    UnquantizedLinearMethod,
-)
-from vllm.model_executor.layers.quantization import QuantizationMethods
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig,
-    QuantizeMethodBase,
-)
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    UnquantizedEmbeddingMethod,
-    VocabParallelEmbedding,
-)
-from vllm.model_executor.models.utils import WeightsMapper
-from vllm.model_executor.utils import set_weight_attrs
-from vllm.platforms import current_platform
-from vllm.utils.torch_utils import direct_register_custom_op
-
-logger = init_logger(__name__)
-
-
-class GGUFConfig(QuantizationConfig):
-    """Config class for GGUF."""
-
-    def __init__(self, unquantized_modules: list[str] | None = None) -> None:
-        super().__init__()
-        self.unquantized_modules = unquantized_modules or []
-
-    def __repr__(self) -> str:
-        return "GGUFConfig()"
-
-    def get_name(self) -> QuantizationMethods:
-        return "gguf"
-
-    def get_supported_act_dtypes(self) -> list[torch.dtype]:
-        # GGUF dequantization kernels use half precision (fp16) internally.
-        # bfloat16 has precision issues on Blackwell devices.
-        if current_platform.has_device_capability(100):
-            logger.warning_once("GGUF has precision issues with bfloat16 on Blackwell.")
-            return [torch.half, torch.float32]
-        return [torch.half, torch.bfloat16, torch.float32]
-
-    @classmethod
-    def get_min_capability(cls) -> int:
-        return 60
-
-    @classmethod
-    def get_config_filenames(cls) -> list[str]:
-        return []  # no extra configs.
-
-    @classmethod
-    def from_config(cls, config: dict[str, Any]) -> "GGUFConfig":
-        return cls()
-
-    @classmethod
-    def override_quantization_method(
-        cls, hf_quant_cfg: dict[str, Any], user_quant: str | None
-    ) -> "QuantizationMethods | None":
-        # When user explicitly specifies --quantization gguf, override
-        # whatever quantization method is in the HF model config (e.g. fp8).
-        if user_quant == "gguf":
-            return "gguf"
-        return None
-
-    def get_quant_method(
-        self, layer: torch.nn.Module, prefix: str
-    ) -> "QuantizeMethodBase | None":
-        if isinstance(layer, LinearBase):
-            if is_layer_skipped_gguf(
-                prefix, self.unquantized_modules, self.packed_modules_mapping
-            ):
-                return UnquantizedLinearMethod()
-            return GGUFLinearMethod(self)
-        elif isinstance(layer, VocabParallelEmbedding):
-            if is_layer_skipped_gguf(
-                prefix, self.unquantized_modules, self.packed_modules_mapping
-            ):
-                return UnquantizedEmbeddingMethod()
-            return GGUFEmbeddingMethod(self)
-        elif isinstance(layer, FusedMoE):
-            # TODO: Select UnquantizedFusedMoEMethod on unquantized layers.
-            return GGUFMoEMethod(self, layer.moe_config)
-        return None
-
-    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
-        """
-        Interface for models to update module names referenced in
-        quantization configs in order to reflect the vllm model structure
-
-        :param hf_to_vllm_mapper: maps from hf model structure (the assumed
-            structure of the qconfig) to vllm model structure
-        """
-        if self.unquantized_modules is not None:
-            self.unquantized_modules = hf_to_vllm_mapper.apply_list(
-                self.unquantized_modules
-            )
-
-
-def is_layer_skipped_gguf(
-    prefix: str,
-    unquantized_modules: list[str],
-    fused_mapping: Mapping[str, list[str]] = MappingProxyType({}),
-):
-    # Fused layers like gate_up_proj or qkv_proj will not be fused
-    # in the safetensors checkpoint. So, we convert the name
-    # from the fused version to unfused + check to make sure that
-    # each shard of the fused layer has the same scheme.
-    proj_name = prefix.split(".")[-1]
-    if proj_name in fused_mapping:
-        shard_prefixes = [
-            prefix.replace(proj_name, shard_proj_name)
-            for shard_proj_name in fused_mapping[proj_name]
-        ]
-
-        is_skipped = None
-        for shard_prefix in shard_prefixes:
-            is_shard_skipped = any(
-                shard_prefix in module_name for module_name in unquantized_modules
-            )
-
-            if is_skipped is None:
-                is_skipped = is_shard_skipped
-            elif is_shard_skipped != is_skipped:
-                raise ValueError(
-                    f"Detected some but not all shards of {prefix} "
-                    "are quantized. All shards of fused layers "
-                    "to have the same precision."
-                )
-    else:
-        is_skipped = any(module_name in prefix for module_name in unquantized_modules)
-
-    assert is_skipped is not None
-    return is_skipped
-
-
-UNQUANTIZED_TYPES = {WeightType.F32, WeightType.F16, WeightType.BF16}
-STANDARD_QUANT_TYPES = {
-    WeightType.Q4_0,
-    WeightType.Q4_1,
-    WeightType.Q5_0,
-    WeightType.Q5_1,
-    WeightType.Q8_0,
-    WeightType.Q8_1,
-}
-KQUANT_TYPES = {
-    WeightType.Q2_K,
-    WeightType.Q3_K,
-    WeightType.Q4_K,
-    WeightType.Q5_K,
-    WeightType.Q6_K,
-}
-IMATRIX_QUANT_TYPES = {
-    WeightType.IQ1_M,
-    WeightType.IQ1_S,
-    WeightType.IQ2_XXS,
-    WeightType.IQ2_XS,
-    WeightType.IQ2_S,
-    WeightType.IQ3_XXS,
-    WeightType.IQ3_S,
-    WeightType.IQ4_XS,
-    WeightType.IQ4_NL,
-}
-# TODO(Isotr0py): Currently, we don't have MMQ kernel for I-Matrix quantization.
-# Consolidate DEQUANT_TYPES, MMVQ_QUANT_TYPES and MMQ_QUANT_TYPES after we add
-# MMQ kernel for I-Matrix quantization.
-DEQUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES | IMATRIX_QUANT_TYPES
-MMVQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES | IMATRIX_QUANT_TYPES
-MMQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES
-
-
-def _fused_mul_mat_gguf(
-    x: torch.Tensor, qweight: torch.Tensor, qweight_type: int
-) -> torch.Tensor:
-    if qweight_type in IMATRIX_QUANT_TYPES:
-        mmvq_safe = 8 if qweight.shape[0] > 5120 else 16
-    else:
-        mmvq_safe = 2 if qweight.shape[0] > 5120 else 6
-    # HACK: when doing chunked prefill we don't generate output tokens
-    # so input to logits generator is empty which causes invalid parameter
-    if x.shape[0] == 0:
-        return torch.empty(x.shape[0], qweight.shape[0], dtype=x.dtype, device=x.device)
-    # there is no need to call any kernel for fp16/bf16
-    if qweight_type in UNQUANTIZED_TYPES:
-        return x @ qweight.T
-    # enable MMVQ in contiguous batching with batch_size=1
-    if x.shape[0] <= mmvq_safe and qweight_type in MMVQ_QUANT_TYPES:
-        y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0])
-    # Use MMQ Kernel if it's available (standard + k-quants)
-    elif qweight_type in MMQ_QUANT_TYPES:
-        y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, qweight.shape[0])
-    # If there is no available MMQ kernel, fallback to dequantize
-    elif qweight_type in DEQUANT_TYPES:
-        block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type]
-        shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size)
-        weight = ops.ggml_dequantize(qweight, qweight_type, *shape, x.dtype)
-        y = x @ weight.T
-    else:
-        # Raise an error if the quantization type is not supported.
-        # Might be useful if llama.cpp adds a new quantization type.
-        # Wrap to GGMLQuantizationType IntEnum to make sure it's a valid type.
-        qweight_type = WeightType(qweight_type)
-        raise NotImplementedError(f"Unsupported GGUF quantization type: {qweight_type}")
-    return y
-
-
-def _fused_mul_mat_gguf_fake(
-    x: torch.Tensor,
-    qweight: torch.Tensor,
-    qweight_type: int,
-) -> torch.Tensor:
-    return torch.empty(x.shape[0], qweight.shape[0], dtype=x.dtype, device=x.device)
-
-
-try:
-    direct_register_custom_op(
-        op_name="_fused_mul_mat_gguf",
-        op_func=_fused_mul_mat_gguf,
-        fake_impl=_fused_mul_mat_gguf_fake,
-    )
-    fused_mul_mat_gguf = torch.ops.vllm._fused_mul_mat_gguf
-
-except AttributeError as error:
-    raise error
-
-
-def _fused_moe_gguf(
-    x: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    qweight_type: int,
-    qweight_type2: int,
-    activation: str,
-) -> torch.Tensor:
-    activation_enum = MoEActivation.from_str(activation)
-
-    def act(x: torch.Tensor):
-        d = x.shape[-1] // 2
-        output_shape = x.shape[:-1] + (d,)
-        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-        apply_moe_activation(activation_enum, out, x)
-        return out
-
-    # lazy import to avoid triggering triton import in CPU backend
-    from vllm.model_executor.layers.fused_moe.fused_moe import moe_align_block_size
-
-    out_hidden_states = torch.empty_like(x)
-    # unless we decent expert reuse we are better off running moe_vec kernel
-    if (
-        qweight_type2 in MMQ_QUANT_TYPES
-        and qweight_type in MMQ_QUANT_TYPES
-        and x.shape[0] > 64
-    ):
-        num_tokens, _ = x.shape
-        E, N, _ = w1.shape
-        top_k = topk_ids.shape[1]
-        BLOCK_SIZE = ops.ggml_moe_get_block_size(qweight_type)
-
-        sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
-            topk_ids, BLOCK_SIZE, E
-        )
-        out = ops.ggml_moe_a8(
-            x,
-            w1,
-            sorted_token_ids,
-            expert_ids,
-            num_tokens_post_padded,
-            qweight_type,
-            N,
-            top_k,
-            num_tokens,
-        )
-        out = act(out)
-        out = ops.ggml_moe_a8(
-            out,
-            w2,
-            sorted_token_ids,
-            expert_ids,
-            num_tokens_post_padded,
-            qweight_type2,
-            w2.shape[1],
-            1,
-            num_tokens * top_k,
-        )
-        out = out.reshape(num_tokens, top_k, w2.shape[1]).mul_(
-            topk_weights.view(num_tokens, top_k, 1)
-        )
-        ops.moe_sum(out, out_hidden_states)
-    elif qweight_type2 in MMVQ_QUANT_TYPES and qweight_type in MMVQ_QUANT_TYPES:
-        num_tokens, _ = x.shape
-        E, N, _ = w1.shape
-        top_k = topk_ids.shape[1]
-
-        out = ops.ggml_moe_a8_vec(x, w1, topk_ids, top_k, qweight_type, N, num_tokens)
-        out = act(out)
-
-        out = ops.ggml_moe_a8_vec(
-            out, w2, topk_ids, 1, qweight_type2, w2.shape[1], num_tokens * top_k
-        )
-        out = out.reshape(num_tokens, top_k, w2.shape[1]).mul_(
-            topk_weights.view(num_tokens, top_k, 1)
-        )
-        ops.moe_sum(out, out_hidden_states)
-    else:
-        logger.warning_once(
-            "There is no support for fast MoE kernel "
-            "for current quantization method. "
-            "Falling back to slow implementation. "
-        )
-        for tok, (w, idx) in enumerate(zip(topk_weights, topk_ids)):
-            inp = x[tok].reshape((1,) + x.shape[1:])
-            current_hidden_state = None
-            for ww, ii in zip(w, idx):
-                expert_up = w1[ii]
-
-                out = fused_mul_mat_gguf(inp, expert_up, qweight_type)
-                out = act(out)
-
-                expert_down = w2[ii]
-                current_state = fused_mul_mat_gguf(
-                    out, expert_down, qweight_type2
-                ).mul_(ww)
-                if current_hidden_state is None:
-                    current_hidden_state = current_state
-                else:
-                    current_hidden_state.add_(current_state)
-            out_hidden_states[tok] = current_hidden_state
-    return out_hidden_states
-
-
-def _fused_moe_gguf_fake(
-    x: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    qweight_type: int,
-    qweight_type2: int,
-    activation: str,
-) -> torch.Tensor:
-    return torch.empty_like(x)
-
-
-try:
-    direct_register_custom_op(
-        op_name="_fused_moe_gguf",
-        op_func=_fused_moe_gguf,
-        fake_impl=_fused_moe_gguf_fake,
-    )
-    fused_moe_gguf = torch.ops.vllm._fused_moe_gguf
-
-except AttributeError as error:
-    raise error
-
-
-def _apply_gguf_embedding(
-    x: torch.Tensor,
-    qweight: torch.Tensor,
-    qweight_type: int,
-    hidden_size: int,
-    dtype: torch.dtype | None = None,
-) -> torch.Tensor:
-    if qweight_type in UNQUANTIZED_TYPES:
-        return torch.embedding(qweight, x)
-    elif qweight_type in DEQUANT_TYPES:
-        block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type]
-        x_flat = x.flatten()
-        assert hidden_size == qweight.shape[1] // type_size * block_size
-        quant = torch.index_select(qweight, dim=0, index=x_flat)
-        dequant = ops.ggml_dequantize(
-            quant, qweight_type, hidden_size, x_flat.shape[0], dtype
-        )
-        return dequant.view(*x.shape, hidden_size)
-    else:
-        qweight_type = WeightType(qweight_type)
-        raise NotImplementedError(f"Unsupported GGUF quantization type: {qweight_type}")
-
-
-def _apply_gguf_embedding_fake(
-    x: torch.Tensor,
-    qweight: torch.Tensor,
-    qweight_type: int,
-    hidden_size: int,
-    dtype: torch.dtype | None = None,
-) -> torch.Tensor:
-    return torch.empty(x.shape[0], hidden_size, dtype=dtype, device=x.device)
-
-
-try:
-    direct_register_custom_op(
-        op_name="_apply_gguf_embedding",
-        op_func=_apply_gguf_embedding,
-        fake_impl=_apply_gguf_embedding_fake,
-    )
-    apply_gguf_embedding = torch.ops.vllm._apply_gguf_embedding
-
-except AttributeError as error:
-    raise error
-
-
-class GGUFLinearMethod(LinearMethodBase):
-    """Linear method for GGUF.
-
-    Args:
-        quant_config: The GGUF quantization config.
-    """
-
-    def __init__(self, quant_config: GGUFConfig):
-        self.quant_config = quant_config
-
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        input_size_per_partition: int,
-        output_partition_sizes: list[int],
-        input_size: int,
-        output_size: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ):
-        self.params_dtype = params_dtype
-        output_size_per_partition = sum(output_partition_sizes)
-
-        tensor_shape = (output_size_per_partition, input_size_per_partition)
-        qweight = GGUFUninitializedParameter(requires_grad=False)
-        set_weight_attrs(
-            qweight,
-            {
-                "input_dim": 1,
-                "output_dim": 0,
-                "tensor_shape": tensor_shape,
-                "is_gguf_weight": True,
-                "data_container": [],
-                "shard_id": [],
-                "shard_id_map": {},
-            },
-        )
-        set_weight_attrs(qweight, extra_weight_attrs)
-        layer.register_parameter("qweight", qweight)
-
-        qweight_type = Parameter(
-            torch.empty(len(output_partition_sizes), dtype=torch.uint8),
-            requires_grad=False,
-        )
-        set_weight_attrs(
-            qweight_type,
-            {
-                "is_gguf_weight_type": True,
-                "weight_type": 0,
-                "shard_weight_type": {},
-                "ignore_warning": True,
-            },
-        )
-        set_weight_attrs(qweight_type, extra_weight_attrs)
-        layer.register_parameter("qweight_type", qweight_type)
-
-    def process_weights_after_loading(self, layer: torch.nn.Module):
-        qweight_type = layer.qweight_type.weight_type
-        if not (qweight_type in UNQUANTIZED_TYPES or qweight_type in DEQUANT_TYPES):
-            qweight_type = WeightType(qweight_type)
-            raise ValueError(
-                f"Unsupported GGUF quantization type {qweight_type} in layer {layer}."
-            )
-        # For MergedColumnParallelLinear and QKVParallelLinear, we need to
-        # materialize the padded weight parameter for CUDA Graph compatibility.
-        self._create_padded_weight_param(layer)
-
-    def _create_padded_weight_param(self, layer: torch.nn.Module):
-        """Create padded weight parameter for GGUF MergedLinear layer."""
-        qweight = layer.qweight
-        shard_id_map = qweight.shard_id_map
-        shard_id = qweight.shard_id
-        if len(data_container := qweight.data_container) > 1:
-            dtype = {data.dtype for data in data_container}
-            assert len(dtype) == 1, ValueError(
-                f"Data container has mixed dtypes: {dtype}"
-            )
-            dtype = next(iter(dtype))
-            # concat dim0 and pad dim1
-            padded_side = max(x.size(1) for x in data_container)
-            concat_side = sum(x.size(0) for x in data_container)
-            # Pad the quantized weights to dense tensor, and create a map
-            # with the location of each shard in the padded tensor.
-            padded_data = torch.zeros(
-                (concat_side, padded_side), dtype=dtype, device=qweight.device
-            )
-            # (dim0_start, dim0_end, dim1_size)
-            shard_offset_map = dict[str, tuple[int, int, int]]()
-            for idx in shard_id:
-                id_in_container = shard_id_map[idx]
-                start = sum(x.size(0) for x in data_container[:id_in_container])
-                end = start + data_container[id_in_container].size(0)
-                size = data_container[id_in_container].size(1)
-                padded_data[start:end, :size] = data_container[id_in_container]
-                shard_offset_map[idx] = (start, end, size)
-            qweight.data_container.clear()
-            padded_param = Parameter(padded_data, requires_grad=False)
-            set_weight_attrs(padded_param, vars(qweight))
-            set_weight_attrs(padded_param, {"shard_offset_map": shard_offset_map})
-            layer.register_parameter("qweight", padded_param)
-
-    def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: torch.Tensor | None = None,
-    ) -> torch.Tensor:
-        shard_id = layer.qweight.shard_id
-
-        if shard_id:
-            # dequantize shard weights respectively
-            shard_id = ["q", "k", "v"] if "q" in shard_id else shard_id
-            qweight = layer.qweight
-            result = []
-            for idx in shard_id:
-                start, end, offset = layer.qweight.shard_offset_map[idx]
-                qweight_type = layer.qweight_type.shard_weight_type[idx]
-                result.append(
-                    fused_mul_mat_gguf(
-                        x, qweight[start:end, :offset].contiguous(), qweight_type
-                    )
-                )
-            out = torch.cat(result, axis=1)
-        else:
-            qweight = layer.qweight
-            qweight_type = layer.qweight_type.weight_type
-            out = fused_mul_mat_gguf(x, qweight, qweight_type)
-        if bias is not None:
-            out.add_(bias)
-        return out
-
-
-class GGUFMoEMethod(FusedMoEMethodBase):
-    """MoE method for GGUF.
-
-    Args:
-        quant_config: The GGUF quantization config.
-    """
-
-    def __init__(
-        self,
-        quant_config: GGUFConfig,
-        moe: FusedMoEConfig,
-    ):
-        super().__init__(moe)
-        self.quant_config = quant_config
-
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        num_experts: int,
-        hidden_size: int,
-        intermediate_size_per_partition: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ):
-        tensor_shape = (num_experts, 2 * intermediate_size_per_partition, hidden_size)
-        # gate up proj
-        w13_qweight = GGUFUninitializedParameter(requires_grad=False)
-        set_weight_attrs(
-            w13_qweight,
-            {
-                "input_dim": 1,
-                "output_dim": 0,
-                "tensor_shape": tensor_shape,
-                "is_gguf_weight": True,
-                "data_container": [],
-            },
-        )
-        set_weight_attrs(w13_qweight, extra_weight_attrs)
-        layer.register_parameter("w13_qweight", w13_qweight)
-
-        w13_qweight_type = Parameter(
-            torch.empty(1, dtype=torch.uint8), requires_grad=False
-        )
-        set_weight_attrs(
-            w13_qweight_type,
-            {"is_gguf_weight_type": True, "weight_type": 0, "ignore_warning": True},
-        )
-        set_weight_attrs(w13_qweight_type, extra_weight_attrs)
-        layer.register_parameter("w13_qweight_type", w13_qweight_type)
-
-        tensor_shape = (num_experts, intermediate_size_per_partition, hidden_size)
-        # gate down proj
-        w2_qweight = GGUFUninitializedParameter(requires_grad=False)
-        set_weight_attrs(
-            w2_qweight,
-            {
-                "input_dim": 1,
-                "output_dim": 0,
-                "tensor_shape": tensor_shape,
-                "is_gguf_weight": True,
-                "data_container": [],
-            },
-        )
-        set_weight_attrs(w2_qweight, extra_weight_attrs)
-        layer.register_parameter("w2_qweight", w2_qweight)
-
-        w2_qweight_type = Parameter(
-            torch.empty(1, dtype=torch.uint8), requires_grad=False
-        )
-        set_weight_attrs(
-            w2_qweight_type,
-            {"is_gguf_weight_type": True, "weight_type": 0, "ignore_warning": True},
-        )
-
-        set_weight_attrs(w2_qweight_type, extra_weight_attrs)
-        layer.register_parameter("w2_qweight_type", w2_qweight_type)
-
-    def get_fused_moe_quant_config(
-        self, layer: torch.nn.Module
-    ) -> FusedMoEQuantConfig | None:
-        return None
-
-    def apply(
-        self,
-        layer: FusedMoE,
-        x: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        shared_experts_input: torch.Tensor | None,
-    ) -> torch.Tensor:
-        if layer.apply_router_weight_on_input:
-            raise NotImplementedError(
-                "Apply router weight on input is not supported for"
-                "fused GGUF MoE method."
-            )
-
-        return fused_moe_gguf(
-            x,
-            layer.w13_qweight,
-            layer.w2_qweight,
-            topk_weights,
-            topk_ids,
-            layer.w13_qweight_type.weight_type,
-            layer.w2_qweight_type.weight_type,
-            layer.activation.value,
-        )
-
-
-class GGUFEmbeddingMethod(GGUFLinearMethod):
-    """Embedding method for GGUF.
-
-    Args:
-        quant_config: The GGUF quantization config.
-    """
-
-    def embedding(self, layer: torch.nn.Module, x: torch.Tensor) -> torch.Tensor:
-        qweight = layer.qweight
-        qweight_type = layer.qweight_type.weight_type
-        hidden_size = qweight.tensor_shape[1]
-
-        return apply_gguf_embedding(
-            x, qweight, qweight_type, hidden_size, dtype=self.params_dtype
-        )
-
-
-class GGUFUninitializedParameter(UninitializedParameter):
-    cls_to_become = Parameter
-    data_container: list[torch.Tensor]
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index c4fbe0962e06..24c02a49448f 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -418,8 +418,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         output_dim = getattr(param, "output_dim", None)
         packed_dim = getattr(param, "packed_dim", None)
 
-        # If the parameter is a gguf weight, then load it directly.
-        if getattr(param, "is_gguf_weight_type", None):
+        if getattr(param, "needs_custom_weight_type", None):
             param.data.copy_(loaded_weight)
             param.weight_type = loaded_weight.item()
             return
@@ -549,8 +548,7 @@ def __init__(
 
     def tie_weights(self, embed_tokens: VocabParallelEmbedding):
         """Tie the weights with word embeddings."""
-        # GGUF quantized embed_tokens.
-        if self.quant_config and self.quant_config.get_name() == "gguf":
+        if self.quant_config and self.quant_config.should_keep_tied_lm_head():
             return embed_tokens
         else:
             self.weight = embed_tokens.weight
diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py
index 53b6b3221b54..40be772e220a 100644
--- a/vllm/model_executor/model_loader/__init__.py
+++ b/vllm/model_executor/model_loader/__init__.py
@@ -12,7 +12,6 @@
 from vllm.model_executor.model_loader.bitsandbytes_loader import BitsAndBytesModelLoader
 from vllm.model_executor.model_loader.default_loader import DefaultModelLoader
 from vllm.model_executor.model_loader.dummy_loader import DummyModelLoader
-from vllm.model_executor.model_loader.gguf_loader import GGUFModelLoader
 from vllm.model_executor.model_loader.runai_streamer_loader import (
     RunaiModelStreamerLoader,
 )
@@ -34,7 +33,6 @@
     "bitsandbytes",
     "dummy",
     "fastsafetensors",
-    "gguf",
     "instanttensor",
     "mistral",
     "npcache",
@@ -51,7 +49,6 @@
     "bitsandbytes": BitsAndBytesModelLoader,
     "dummy": DummyModelLoader,
     "fastsafetensors": DefaultModelLoader,
-    "gguf": GGUFModelLoader,
     "instanttensor": DefaultModelLoader,
     "mistral": DefaultModelLoader,
     "npcache": DefaultModelLoader,
@@ -149,7 +146,6 @@ def get_model(
     "register_model_loader",
     "BaseModelLoader",
     "BitsAndBytesModelLoader",
-    "GGUFModelLoader",
     "DefaultModelLoader",
     "DummyModelLoader",
     "RunaiModelStreamerLoader",
diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py
deleted file mode 100644
index ce6a813b8da5..000000000000
--- a/vllm/model_executor/model_loader/gguf_loader.py
+++ /dev/null
@@ -1,436 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
-from collections.abc import Generator
-from typing import TYPE_CHECKING, cast
-
-import gguf
-import regex as re
-import torch
-import torch.nn as nn
-from huggingface_hub import hf_hub_download
-from transformers import AutoModelForCausalLM, AutoModelForImageTextToText
-
-from vllm.config import ModelConfig, VllmConfig
-from vllm.config.load import LoadConfig
-from vllm.logger import init_logger
-from vllm.model_executor.model_loader.base_loader import BaseModelLoader
-from vllm.model_executor.model_loader.utils import (
-    initialize_model,
-    process_weights_after_loading,
-)
-from vllm.model_executor.model_loader.weight_utils import (
-    download_gguf,
-    get_gguf_extra_tensor_names,
-    get_gguf_weight_type_map,
-    gguf_quant_weights_iterator,
-    gguf_quant_weights_iterator_multi,
-)
-from vllm.transformers_utils.gguf_utils import detect_gguf_multimodal
-from vllm.utils.torch_utils import set_default_torch_dtype
-
-if TYPE_CHECKING:
-    from vllm.model_executor.layers.quantization.gguf import GGUFConfig
-
-logger = init_logger(__name__)
-
-
-class GGUFModelLoader(BaseModelLoader):
-    """
-    Model loader that can load GGUF files. This is useful for loading models
-    that are quantized with GGUF and saved in the GGUF format. This loader
-    supports loading both full models and sharded models.
-    """
-
-    def __init__(self, load_config: LoadConfig):
-        super().__init__(load_config)
-        if load_config.model_loader_extra_config:
-            raise ValueError(
-                f"Model loader extra config is not supported for "
-                f"load format {load_config.load_format}"
-            )
-
-    def _prepare_weights(self, model_config: ModelConfig):
-        model_name_or_path = model_config.model
-        if os.path.isfile(model_name_or_path):
-            return model_name_or_path
-        # repo id/filename.gguf
-        if "/" in model_name_or_path and model_name_or_path.endswith(".gguf"):
-            repo_id, filename = model_name_or_path.rsplit("/", 1)
-            return hf_hub_download(repo_id=repo_id, filename=filename)
-        # repo_id:quant_type
-        elif "/" in model_name_or_path and ":" in model_name_or_path:
-            repo_id, quant_type = model_name_or_path.rsplit(":", 1)
-            return download_gguf(
-                repo_id,
-                quant_type,
-                cache_dir=self.load_config.download_dir,
-                revision=model_config.revision,
-                ignore_patterns=self.load_config.ignore_patterns,
-            )
-
-        raise ValueError(
-            f"Unrecognised GGUF reference: {model_name_or_path} "
-            "(expected local file, <repo_id>/<filename>.gguf, "
-            "or <repo_id>:<quant_type>)"
-        )
-
-    @staticmethod
-    def _get_all_gguf_files(model_path: str) -> list[str]:
-        """Discover all GGUF shard files from a single shard path.
-
-        Supports variable-width shard indices by dynamically detecting
-        the padding from the original filename.
-        E.g. ``*-00001-of-00005.gguf`` → all 5 shards,
-             ``*-01-of-15.gguf`` → all 15 shards.
-        """
-        match = re.search(r"-(\d+)-of-(\d+)\.gguf$", model_path)
-        if not match:
-            return [model_path]
-        total = int(match.group(2))
-        num_digits = len(match.group(1))
-        prefix = model_path[: match.start(1)]
-        suffix = model_path[match.end(2) :]
-        files = []
-        for i in range(1, total + 1):
-            shard_path = f"{prefix}{i:0{num_digits}d}-of-{total:0{num_digits}d}{suffix}"
-            if os.path.isfile(shard_path):
-                files.append(shard_path)
-        if files:
-            logger.info("Discovered %d GGUF shard files", len(files))
-        return files if files else [model_path]
-
-    def _get_gguf_weights_map(self, model_config: ModelConfig):
-        """
-        GGUF uses this naming convention for their tensors from HF checkpoint:
-        `blk.N.BB.weight` and `blk.N.BB.bias`
-        where N signifies the block number of a layer, and BB signifies the
-        attention/mlp layer components.
-        See "Standardized tensor names" in
-        https://github.com/ggerganov/ggml/blob/master/docs/gguf.md for details.
-        """
-        config = model_config.hf_config
-        # Get text config to handle both nested (multimodal) and flat
-        # (text-only) config structures. For multimodal models like
-        # Gemma3Config, this returns config.text_config. For text-only
-        # models, this returns config itself.
-        text_config = config.get_text_config()
-        model_type = config.model_type
-        is_multimodal = (
-            hasattr(config, "vision_config") and config.vision_config is not None
-        )
-        gguf_to_hf_name_map = {}
-        sideload_params: list[re.Pattern] = []
-        # hack: ggufs have a different name than transformers
-        if model_type == "cohere":
-            model_type = "command-r"
-        if model_type == "gemma3_text":
-            # Gemma3 models use "gemma3_text" in HuggingFace but
-            # "gemma3" in GGUF architecture naming
-            model_type = "gemma3"
-        if model_type in ("deepseek_v3", "deepseek_v2"):
-            model_type = "deepseek2"
-            # GGUF layer map assumes that we will have a merged expert weights
-            # so we need to map them manually
-            for idx in range(config.num_hidden_layers):
-                gguf_to_hf_name_map[f"blk.{idx}.exp_probs_b.bias"] = (
-                    f"model.layers.{idx}.mlp.gate.e_score_correction_bias"
-                )
-                gguf_to_hf_name_map[f"blk.{idx}.ffn_down_exps.weight"] = (
-                    f"model.layers.{idx}.mlp.experts.0.down_proj.weight"
-                )
-                gguf_to_hf_name_map[f"blk.{idx}.ffn_gate_exps.weight"] = (
-                    f"model.layers.{idx}.mlp.experts.0.gate_proj.weight"
-                )
-                gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
-                    f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
-                )
-                sideload_params.append(
-                    re.compile(
-                        f"model\\.layers\\.{idx}"
-                        r"\.mlp\.experts\.[0-9]+\.(gate|up|down)_proj\.weight"
-                    )
-                )
-        if model_type in ("qwen2_moe", "qwen3_moe"):
-            model_type = model_type.replace("_", "")
-            # GGUF layer map assumes that we will have a merged expert weights
-            # so we need to map them manually
-            for idx in range(config.num_hidden_layers):
-                gguf_to_hf_name_map[f"blk.{idx}.ffn_down_exps.weight"] = (
-                    f"model.layers.{idx}.mlp.experts.0.down_proj.weight"
-                )
-                gguf_to_hf_name_map[f"blk.{idx}.ffn_gate_exps.weight"] = (
-                    f"model.layers.{idx}.mlp.experts.0.gate_proj.weight"
-                )
-                gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
-                    f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
-                )
-                sideload_params.append(
-                    re.compile(
-                        f"model\\.layers\\.{idx}"
-                        r"\.mlp\.experts\.[0-9]+\.(gate|up|down)_proj\.weight"
-                    )
-                )
-        if model_type == "minimax_m2":
-            model_type = "minimax-m2"
-            # GGUF layer map assumes merged expert weights
-            # map them manually like deepseek2
-            for idx in range(config.num_hidden_layers):
-                gguf_to_hf_name_map[f"blk.{idx}.exp_probs_b.bias"] = (
-                    f"model.layers.{idx}.block_sparse_moe.e_score_correction_bias"
-                )
-                gguf_to_hf_name_map[f"blk.{idx}.ffn_down_exps.weight"] = (
-                    f"model.layers.{idx}.block_sparse_moe.experts.0.w2.weight"
-                )
-                gguf_to_hf_name_map[f"blk.{idx}.ffn_gate_exps.weight"] = (
-                    f"model.layers.{idx}.block_sparse_moe.experts.0.w1.weight"
-                )
-                gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
-                    f"model.layers.{idx}.block_sparse_moe.experts.0.w3.weight"
-                )
-                sideload_params.append(
-                    re.compile(
-                        f"model\\.layers\\.{idx}"
-                        r"\.block_sparse_moe\.experts\.(gate_up_proj|down_proj)"
-                    )
-                )
-
-        arch = None
-        for key, value in gguf.MODEL_ARCH_NAMES.items():
-            if value == model_type:
-                arch = key
-                break
-        if arch is None:
-            raise RuntimeError(f"Unknown gguf model_type: {model_type}")
-        text_num_layers = text_config.num_hidden_layers
-        text_name_map = gguf.get_tensor_name_map(arch, text_num_layers)
-
-        if is_multimodal:
-            mm_proj_arch = gguf.MODEL_ARCH.MMPROJ
-            vision_num_layers = config.vision_config.num_hidden_layers
-            vision_name_map = gguf.get_tensor_name_map(mm_proj_arch, vision_num_layers)
-        else:
-            vision_name_map = None
-
-        # Create dummy model to extract parameter names
-        # For multimodal: use AutoModelForImageTextToText to get
-        # language + vision + projector params
-        # For text-only: use AutoModelForCausalLM to get language model params
-        auto_cls = (
-            AutoModelForImageTextToText if is_multimodal else AutoModelForCausalLM
-        )
-        with torch.device("meta"):
-            dummy_model = auto_cls.from_config(
-                config, trust_remote_code=model_config.trust_remote_code
-            )
-
-        state_dict = dummy_model.state_dict()
-        if hf_checkpoint_map := getattr(
-            dummy_model, "_checkpoint_conversion_mapping", None
-        ):
-
-            def revert_hf_rename(name: str) -> str:
-                for original_name, hf_name in hf_checkpoint_map.items():
-                    if hf_name in name:
-                        name = name.replace(hf_name, original_name).lstrip("^")
-                return name
-
-            state_dict = {
-                revert_hf_rename(name): tensor for name, tensor in state_dict.items()
-            }
-
-        if model_type == "minimax-m2" and not hf_checkpoint_map:
-            # Reverse HF convention: mlp -> block_sparse_moe
-            state_dict = {
-                name.replace(".mlp.", ".block_sparse_moe."): tensor
-                for name, tensor in state_dict.items()
-            }
-
-        def find_hf_name_in_tensor_map(hf_name: str) -> str | None:
-            """
-            Map HuggingFace parameter name to GGUF tensor name.
-
-            This function handles the mismatch between HF parameter naming
-            conventions and gguf-py's expected format:
-            1. Strips 'model.' prefix (common in multimodal models)
-            2. Converts '_weight' suffix to '.weight' (Gemma3 compatibility)
-            3. Searches vision_name_map for multimodal parameters
-            4. Falls back to text_name_map for language model parameters
-
-            Args:
-                hf_name: Full HuggingFace parameter name (e.g.,
-                        'model.multi_modal_projector.mm_soft_emb_norm.weight')
-
-            Returns:
-                GGUF tensor name with suffix (e.g., 'mm.soft_emb_norm.weight')
-                or None if no mapping found
-            """
-            # Strip 'language_model.' prefix for multimodal models - gguf-py
-            # tensor mappings expect parameter names without this prefix.
-            # Note: 'model.' prefix should be KEPT for text-only models as
-            # gguf-py expects it.
-            if hf_name.startswith("language_model."):
-                hf_name = hf_name[15:]  # Remove 'language_model.'
-
-            # Parse parameter name and suffix
-            if hf_name.endswith((".weight", ".bias")):
-                base_name, suffix = hf_name.rsplit(".", 1)
-            else:
-                base_name, suffix = hf_name, ""
-                # Handle '_weight' suffix (Gemma3 naming: parameter ends with
-                # '_weight' instead of '.weight')
-                if base_name.endswith("_weight"):
-                    base_name = base_name[:-7]  # Remove '_weight'
-                    suffix = "weight"
-
-            gguf_name = None
-            # Priority 1: Search vision/projector parameters for multimodal models
-            if vision_name_map is not None:
-                gguf_name = vision_name_map.get_name(base_name)
-
-            # Priority 2: Search text backbone parameters
-            if gguf_name is None:
-                gguf_name = text_name_map.get_name(base_name)
-
-            if gguf_name is None:
-                return None
-
-            return gguf_name + "." + suffix
-
-        # Build mapping and track unmapped parameters
-        unmapped_params = []
-        for hf_name in state_dict:
-            gguf_name_with_suffix = find_hf_name_in_tensor_map(hf_name)
-
-            # Track mapping success
-            if gguf_name_with_suffix is not None:
-                gguf_to_hf_name_map[gguf_name_with_suffix] = hf_name
-                logger.debug("Mapped GGUF %s → HF %s", gguf_name_with_suffix, hf_name)
-            elif hf_name not in gguf_to_hf_name_map.values():
-                # Parameter not in manual overrides either
-                unmapped_params.append(hf_name)
-
-        # All parameters (except those initialized by other means) must be mapped:
-        # both vision/projector and backbone
-        if unmapped_params:
-            unmapped_params = list(
-                filter(
-                    lambda x: not any(re.fullmatch(p, x) for p in sideload_params),
-                    unmapped_params,
-                )
-            )
-        if unmapped_params:
-            raise RuntimeError(
-                f"Failed to map GGUF parameters "
-                f"({len(unmapped_params)}): "
-                f"{unmapped_params}"
-            )
-        return gguf_to_hf_name_map
-
-    def _get_gguf_weight_type(
-        self,
-        model_config: ModelConfig,
-        model_name_or_path: str,
-        gguf_to_hf_name_map: dict[str, str],
-    ) -> dict[str, str]:
-        gguf_files = self._get_all_gguf_files(model_name_or_path)
-        weight_type_map = {}
-        for f in gguf_files:
-            weight_type_map.update(get_gguf_weight_type_map(f, gguf_to_hf_name_map))
-        is_multimodal = hasattr(model_config.hf_config, "vision_config")
-        if is_multimodal:
-            mmproj_file = detect_gguf_multimodal(model_name_or_path)
-            assert mmproj_file is not None, (
-                "Could not find mm_proj file for multimodal GGUF model"
-            )
-            logger.info("Loading extra mm_proj weights from %s...", mmproj_file)
-            mm_proj_weight_type_map = get_gguf_weight_type_map(
-                mmproj_file, gguf_to_hf_name_map
-            )
-            weight_type_map.update(mm_proj_weight_type_map)
-        return weight_type_map
-
-    def _get_weights_iterator(
-        self,
-        model_config: ModelConfig,
-        model_name_or_path: str,
-        gguf_to_hf_name_map: dict[str, str],
-    ) -> Generator[tuple[str, torch.Tensor], None, None]:
-        """
-        Iterate over GGUF model weights, loading from both main model file and
-        mmproj.gguf for multimodal Gemma3 models.
-
-        For Gemma3 multimodal GGUF models:
-        - Main file (gemma-3-*.gguf): Language model weights (model.*)
-        - mmproj file (mmproj*.gguf): Vision tower + projector weights (v.*, mm.*)
-
-        Yields:
-            Tuples of (parameter_name, tensor) for all model weights
-        """
-        hf_config = model_config.hf_config
-        is_multimodal = hasattr(hf_config, "vision_config")
-
-        if is_multimodal:
-            # Load mm_proj (mm_encoder + projector) for multimodal weights
-            mmproj_file = detect_gguf_multimodal(model_name_or_path)
-            assert mmproj_file is not None, (
-                "Could not find mm_proj file for multimodal GGUF model"
-            )
-            yield from gguf_quant_weights_iterator(mmproj_file, gguf_to_hf_name_map)
-
-        gguf_files = self._get_all_gguf_files(model_name_or_path)
-        if len(gguf_files) > 1:
-            yield from gguf_quant_weights_iterator_multi(
-                gguf_files, gguf_to_hf_name_map
-            )
-        else:
-            yield from gguf_quant_weights_iterator(
-                model_name_or_path, gguf_to_hf_name_map
-            )
-
-    def download_model(self, model_config: ModelConfig) -> None:
-        self._prepare_weights(model_config)
-
-    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
-        local_model_path = self._prepare_weights(model_config)
-        gguf_weights_map = self._get_gguf_weights_map(model_config)
-        model.load_weights(
-            self._get_weights_iterator(model_config, local_model_path, gguf_weights_map)
-        )
-
-    def load_model(
-        self, vllm_config: VllmConfig, model_config: ModelConfig, prefix: str = ""
-    ) -> nn.Module:
-        device_config = vllm_config.device_config
-        local_model_path = self._prepare_weights(model_config)
-        gguf_weights_map = self._get_gguf_weights_map(model_config)
-        # we can only know if tie word embeddings after mapping weights
-        gguf_files = self._get_all_gguf_files(local_model_path)
-        all_extra_names = []
-        for f in gguf_files:
-            all_extra_names.extend(get_gguf_extra_tensor_names(f, gguf_weights_map))
-        if "lm_head.weight" in all_extra_names:
-            model_config.hf_config.update({"tie_word_embeddings": True})
-
-        weight_type_map = self._get_gguf_weight_type(
-            model_config, local_model_path, gguf_weights_map
-        )
-        # filter out unquantized modules to skip
-        unquant_names = [
-            name.removesuffix(".weight")
-            for name, weight_type in weight_type_map.items()
-            if weight_type in ("F32", "F16", "BF16") and name.endswith(".weight")
-        ]
-        logger.debug("GGUF unquantized modules: %s", unquant_names)
-        if TYPE_CHECKING:
-            vllm_config.quant_config = cast(GGUFConfig, vllm_config.quant_config)
-        vllm_config.quant_config.unquantized_modules.extend(unquant_names)
-
-        target_device = torch.device(device_config.device)
-        with set_default_torch_dtype(model_config.dtype):
-            with target_device:
-                model = initialize_model(vllm_config=vllm_config, prefix=prefix)
-            self.load_weights(model, model_config)
-
-            process_weights_after_loading(model, model_config, target_device)
-        return model
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 3b961e8e143d..45d83e2f7179 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -50,11 +50,6 @@
     runai_model_streamer = PlaceholderModule("runai_model_streamer")  # type: ignore[assignment]
     SafetensorsStreamer = runai_model_streamer.placeholder_attr("SafetensorsStreamer")
 
-try:
-    import gguf
-except ImportError:
-    gguf = PlaceholderModule("gguf")
-
 try:
     from fastsafetensors import SafeTensorsFileLoader, SingleGroup
 except ImportError:
@@ -263,9 +258,8 @@ def get_quant_config(
         raise ValueError("Model quantization method is not specified in the config.")
     quant_cls = get_quantization_config(model_config.quantization)
 
-    # GGUF doesn't have config file
-    if model_config.quantization == "gguf":
-        return quant_cls()
+    if not quant_cls.requires_hf_quant_config():
+        return quant_cls.from_config({})
 
     # Read the quantization config from the HF model config, if available.
     hf_quant_config = getattr(model_config.hf_config, "quantization_config", None)
@@ -466,52 +460,6 @@ def get_sparse_attention_config(
     return config
 
 
-def download_gguf(
-    repo_id: str,
-    quant_type: str,
-    cache_dir: str | None = None,
-    revision: str | None = None,
-    ignore_patterns: str | list[str] | None = None,
-) -> str:
-    # Use patterns that snapshot_download can handle directly
-    # Patterns to match:
-    # - *-{quant_type}.gguf (root)
-    # - *-{quant_type}-*.gguf (root sharded)
-    # - */*-{quant_type}.gguf (subdir)
-    # - */*-{quant_type}-*.gguf (subdir sharded)
-    allow_patterns = [
-        f"*-{quant_type}.gguf",
-        f"*-{quant_type}-*.gguf",
-        f"*/*-{quant_type}.gguf",
-        f"*/*-{quant_type}-*.gguf",
-    ]
-
-    # Use download_weights_from_hf which handles caching and downloading
-    folder = download_weights_from_hf(
-        model_name_or_path=repo_id,
-        cache_dir=cache_dir,
-        allow_patterns=allow_patterns,
-        revision=revision,
-        ignore_patterns=ignore_patterns,
-    )
-
-    # Find the downloaded file(s) in the folder
-    local_files = []
-    for pattern in allow_patterns:
-        # Convert pattern to glob pattern for local filesystem
-        glob_pattern = os.path.join(folder, pattern)
-        local_files.extend(glob.glob(glob_pattern))
-
-    if not local_files:
-        raise ValueError(
-            f"Downloaded GGUF files not found in {folder} for quant_type {quant_type}"
-        )
-
-    # Sort to ensure consistent ordering (prefer non-sharded files)
-    local_files.sort(key=lambda x: (x.count("-"), x))
-    return local_files[0]
-
-
 @instrument(span_name="Download weights - HF")
 def download_weights_from_hf(
     model_name_or_path: str,
@@ -1231,118 +1179,6 @@ def _load_file(bin_file: str):
             del state
 
 
-def get_gguf_extra_tensor_names(
-    gguf_file: str | Path, gguf_to_hf_name_map: dict[str, str]
-) -> list[str]:
-    reader = gguf.GGUFReader(gguf_file)
-    expected_gguf_keys = set(gguf_to_hf_name_map.keys())
-    exact_gguf_keys = set([tensor.name for tensor in reader.tensors])
-    extra_keys = expected_gguf_keys - exact_gguf_keys
-    return [gguf_to_hf_name_map[key] for key in extra_keys]
-
-
-def get_gguf_weight_type_map(
-    gguf_file: str | Path, gguf_to_hf_name_map: dict[str, str]
-) -> dict[str, str]:
-    """
-    Return GGUF mapped weight's name and its quant type
-    """
-    reader = gguf.GGUFReader(gguf_file)
-    return {
-        gguf_to_hf_name_map[tensor.name]: tensor.tensor_type.name
-        for tensor in reader.tensors
-        if tensor.name in gguf_to_hf_name_map
-    }
-
-
-def gguf_quant_weights_iterator(
-    gguf_file: str | Path, gguf_to_hf_name_map: dict[str, str]
-) -> Generator[tuple[str, torch.Tensor], None, None]:
-    """
-    Iterate over the quant weights in the model gguf files and convert
-    them to torch tensors.
-    Be careful of the order of yielding weight types and weights data,
-    we have to yield all weight types first before yielding any weights.
-    Otherwise it would cause issue when loading weights with for packed
-    layer with different quant types.
-    """
-
-    reader = gguf.GGUFReader(gguf_file)
-
-    for tensor in reader.tensors:
-        if tensor.name in gguf_to_hf_name_map:
-            weight_type = tensor.tensor_type
-            name = gguf_to_hf_name_map[tensor.name]
-
-            if weight_type.name not in ("F32", "BF16", "F16"):
-                weight_type_name = name.replace("weight", "qweight_type")
-                weight_type = torch.tensor(weight_type)
-                yield weight_type_name, weight_type
-
-    for tensor in reader.tensors:
-        if tensor.name in gguf_to_hf_name_map:
-            weight = tensor.data
-            weight_type = tensor.tensor_type
-            name = gguf_to_hf_name_map[tensor.name]
-            if weight_type.name not in ("F32", "BF16", "F16"):
-                name = name.replace("weight", "qweight")
-            if weight_type.name == "BF16" and tensor.data.dtype == np.uint8:
-                # BF16 is currently the only "quantization" type that isn't
-                # actually quantized but is read as a raw byte tensor.
-                # Reinterpret as `torch.bfloat16` tensor.
-                weight = weight.view(np.uint16)
-                if reader.byte_order == "S":
-                    # GGUF endianness != system endianness
-                    weight = weight.byteswap()
-                param = torch.tensor(weight).view(torch.bfloat16)
-            else:
-                param = torch.tensor(weight)
-            yield name, param
-
-
-def gguf_quant_weights_iterator_multi(
-    gguf_files: list[str], gguf_to_hf_name_map: dict[str, str]
-) -> Generator[tuple[str, torch.Tensor], None, None]:
-    """
-    Iterate over the quant weights across multiple GGUF shard files
-    and convert them to torch tensors.
-
-    Like gguf_quant_weights_iterator, we yield all weight types first
-    before yielding any weights data to avoid issues with packed layers
-    that have different quant types.
-    """
-    readers = [gguf.GGUFReader(f) for f in gguf_files]
-
-    # First pass: yield all weight types across all shards
-    for reader in readers:
-        for tensor in reader.tensors:
-            if tensor.name in gguf_to_hf_name_map:
-                weight_type = tensor.tensor_type
-                name = gguf_to_hf_name_map[tensor.name]
-                if weight_type.name not in ("F32", "BF16", "F16"):
-                    weight_type_name = name.replace("weight", "qweight_type")
-                    weight_type = torch.tensor(weight_type)
-                    yield weight_type_name, weight_type
-
-    # Second pass: yield all weight data across all shards
-    for reader in readers:
-        for tensor in reader.tensors:
-            if tensor.name in gguf_to_hf_name_map:
-                weight = tensor.data
-                weight_type = tensor.tensor_type
-                name = gguf_to_hf_name_map[tensor.name]
-                if weight_type.name not in ("F32", "BF16", "F16"):
-                    name = name.replace("weight", "qweight")
-                if weight_type.name == "BF16" and tensor.data.dtype == np.uint8:
-                    weight = weight.view(np.uint16)
-                    if reader.byte_order == "S":
-                        weight = weight.byteswap()
-                    param = torch.tensor(weight).view(torch.bfloat16)
-                else:
-                    param = torch.tensor(weight)
-                yield name, param
-
-
 def convert_pyslice_to_tensor(x: Any) -> torch.Tensor:
     """convert PySafeSlice object from safetensors to torch.Tensor
 
diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py
index 5905a198b289..7254f41fd2f0 100644
--- a/vllm/model_executor/models/apertus.py
+++ b/vllm/model_executor/models/apertus.py
@@ -228,9 +228,10 @@ def _init_rotary_emb(
         quant_config: QuantizationConfig | None,
     ) -> None:
         is_neox_style = True
-        is_gguf = quant_config and quant_config.get_name() == "gguf"
-        if is_gguf and config.model_type == "apertus":
-            is_neox_style = False
+        if quant_config is not None:
+            override = quant_config.override_is_neox_style(config.model_type)
+            if override is not None:
+                is_neox_style = override
 
         self.rotary_emb = get_rope(
             self.head_dim,
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index b633fd285082..4a56aa676073 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -162,8 +162,10 @@ def __init__(
         )
 
         is_neox_style = True
-        if quant_config is not None and quant_config.get_name() == "gguf":
-            is_neox_style = False
+        if quant_config is not None:
+            override = quant_config.override_is_neox_style(config.model_type)
+            if override is not None:
+                is_neox_style = override
 
         self.rotary_emb = get_rope(
             self.head_dim,
diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py
index 04708de93d39..51b317a51229 100644
--- a/vllm/model_executor/models/exaone4.py
+++ b/vllm/model_executor/models/exaone4.py
@@ -168,8 +168,10 @@ def __init__(
         self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
 
         is_neox_style = True
-        if quant_config is not None and quant_config.get_name() == "gguf":
-            is_neox_style = False
+        if quant_config is not None:
+            override = quant_config.override_is_neox_style(config.model_type)
+            if override is not None:
+                is_neox_style = override
 
         layer_idx = extract_layer_index(prefix)
         is_sliding = config.layer_types[layer_idx] == "sliding_attention"
diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py
index b2352a3c9268..5b3f0688de4d 100644
--- a/vllm/model_executor/models/gemma3.py
+++ b/vllm/model_executor/models/gemma3.py
@@ -383,14 +383,10 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            # Revert +1 during llama.cpp conversion
-            # see: https://github.com/ggml-org/llama.cpp/blob/be7c3034108473beda214fd1d7c98fd6a7a3bdf5/convert_hf_to_gguf.py#L3397-L3400
-            if (
-                self.quant_config
-                and self.quant_config.get_name() == "gguf"
-                and name.endswith("norm.weight")
-            ):
-                loaded_weight -= 1
+            if self.quant_config is not None:
+                loaded_weight = self.quant_config.transform_loaded_weight(
+                    name, loaded_weight
+                )
 
             if self.quant_config is not None and (
                 scale_name := self.quant_config.get_cache_scale(name)
diff --git a/vllm/model_executor/models/jais2.py b/vllm/model_executor/models/jais2.py
index 4e03eb12ee44..0f3ec5315847 100644
--- a/vllm/model_executor/models/jais2.py
+++ b/vllm/model_executor/models/jais2.py
@@ -161,8 +161,10 @@ def __init__(
         )
 
         is_neox_style = True
-        if quant_config is not None and quant_config.get_name() == "gguf":
-            is_neox_style = False
+        if quant_config is not None:
+            override = quant_config.override_is_neox_style(config.model_type)
+            if override is not None:
+                is_neox_style = override
 
         self.rotary_emb = get_rope(
             self.head_dim,
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 2ecced3df8ba..9a3456b4fe86 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -238,9 +238,10 @@ def _init_rotary_emb(
         quant_config: QuantizationConfig | None,
     ) -> None:
         is_neox_style = True
-        is_gguf = quant_config and quant_config.get_name() == "gguf"
-        if is_gguf and config.model_type == "llama":
-            is_neox_style = False
+        if quant_config is not None:
+            override = quant_config.override_is_neox_style(config.model_type)
+            if override is not None:
+                is_neox_style = override
 
         self.rotary_emb = get_rope(
             self.head_dim,
diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py
index b84b4e2ae512..2c20c0673e2e 100644
--- a/vllm/model_executor/models/llama4.py
+++ b/vllm/model_executor/models/llama4.py
@@ -240,9 +240,10 @@ def __init__(
             prefix=f"{prefix}.o_proj",
         )
         is_neox_style = True
-        is_gguf = quant_config and quant_config.get_name() == "gguf"
-        if is_gguf and config.model_type == "llama":
-            is_neox_style = False
+        if quant_config is not None:
+            override = quant_config.override_is_neox_style(config.model_type)
+            if override is not None:
+                is_neox_style = override
 
         self.rotary_emb = (
             get_rope(
diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py
index 994ae82529ab..7f84f324eb09 100644
--- a/vllm/model_executor/models/openpangu.py
+++ b/vllm/model_executor/models/openpangu.py
@@ -533,9 +533,10 @@ def _init_rotary_emb(
         quant_config: QuantizationConfig | None,
     ) -> None:
         is_neox_style = True
-        is_gguf = quant_config and quant_config.get_name() == "gguf"
-        if is_gguf and config.model_type == "PanguEmbedded":
-            is_neox_style = False
+        if quant_config is not None:
+            override = quant_config.override_is_neox_style(config.model_type)
+            if override is not None:
+                is_neox_style = override
 
         rope_parameters = config.rope_parameters or {}
         if rope_parameters is not None and rope_parameters.get(
@@ -732,14 +733,16 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
         # no need to narrow
         is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit
 
-        # Special case for GGUF
-        is_gguf_weight = getattr(param, "is_gguf_weight", False)
-        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
-        if is_gguf_weight_type:
+        needs_custom_weight_materialization = getattr(
+            param, "needs_custom_weight_materialization", False
+        )
+        needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False)
+        if needs_custom_weight_type:
             param.weight_type = loaded_weight.item()
 
-        # Materialize GGUF UninitializedParameter
-        if is_gguf_weight and isinstance(param, nn.UninitializedParameter):
+        if needs_custom_weight_materialization and isinstance(
+            param, nn.UninitializedParameter
+        ):
             final_shape = list(loaded_weight.shape)
             if output_dim is not None:
                 assert final_shape[output_dim] % self.tp_size == 0
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index ce3a260d0ef6..c50d737878f5 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -968,20 +968,9 @@ def maybe_swap_ffn_param(
     params_dict: dict[str, torch.Tensor],
     quant_config: QuantizationConfig,
 ) -> torch.Tensor:
-    if not (quant_config and quant_config.get_name() == "gguf") or ".fc" not in name:
+    if quant_config is None or ".fc" not in name:
         return param
-    # Some GGUF models have fc1 and fc2 weights swapped
-    tp_size = get_tensor_model_parallel_world_size()
-    output_dim = getattr(param, "output_dim", 0)
-    output_size = param.size(output_dim) * tp_size
-    weight_out_size = loaded_weight.size(output_dim)
-    if ".fc1." in name and output_size != weight_out_size:
-        new_name = name.replace(".fc1.", ".fc2.")
-        param = params_dict[new_name]
-    elif ".fc2." in name and output_size != weight_out_size:
-        new_name = name.replace(".fc2.", ".fc1.")
-        param = params_dict[new_name]
-    return param
+    return quant_config.remap_loaded_parameter(name, param, loaded_weight, params_dict)
 
 
 # Adapted from: https://github.com/huggingface/transformers/blob/v4.54.1/src/transformers/models/siglip/modeling_siglip.py#L200
diff --git a/vllm/model_format.py b/vllm/model_format.py
new file mode 100644
index 000000000000..cb77983a9470
--- /dev/null
+++ b/vllm/model_format.py
@@ -0,0 +1,246 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from vllm.logger import init_logger
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig
+
+logger = init_logger(__name__)
+
+
+class ModelFormatHandler:
+    """Extension hook for out-of-tree model formats.
+
+    Handlers can customize how a model reference is interpreted across vLLM,
+    such as model/config discovery, tokenizer and processor resolution, and
+    engine-arg defaults.
+
+    Every hook below has a pass-through default, so a subclass overrides only
+    the hooks it needs. `get_model_format_handler` selects a handler by
+    calling `matches` on the model reference.
+    """
+
+    # Compared by `register_model_format` to replace an earlier handler with
+    # the same name; the empty default opts out of that replacement.
+    name: str = ""
+
+    def matches(self, model: str | Path | None) -> bool:
+        """Return True if this handler applies to ``model``.
+
+        The base implementation never matches.
+        """
+        return False
+
+    def update_engine_args(self, engine_args: Any) -> None:
+        """Adjust ``engine_args`` in place; the default does nothing."""
+        return
+
+    def prepare_hf_config_load(
+        self,
+        model: str | Path,
+        revision: str | None = None,
+        kwargs: dict[str, Any] | None = None,
+    ) -> tuple[str | Path, dict[str, Any]]:
+        """Return the ``(model, kwargs)`` pair to load the HF config with.
+
+        The default keeps ``model`` as is and returns ``kwargs`` unchanged
+        (an empty dict when ``kwargs`` is None or empty).
+        """
+        return model, kwargs or {}
+
+    def should_use_hf_config_parser(
+        self,
+        original_model: str | Path,
+        resolved_model: str | Path,
+    ) -> bool:
+        """Return True to select the HF config parser without a config file.
+
+        ``original_model`` is the reference as given by the caller and
+        ``resolved_model`` the one returned by `prepare_hf_config_load`.
+        """
+        return False
+
+    def get_missing_hf_config_error(
+        self,
+        original_model: str | Path,
+        resolved_model: str | Path,
+    ) -> str | None:
+        """Return a format-specific message for a missing HF config.
+
+        Returning None (the default) leaves the caller's generic error in
+        place.
+        """
+        return None
+
+    def patch_parsed_hf_config(
+        self,
+        original_model: str | Path,
+        config_dict: dict[str, Any],
+        config: "PretrainedConfig",
+    ) -> "PretrainedConfig":
+        """Return the config to use in place of the freshly parsed ``config``.
+
+        ``config_dict`` is the parsed config dictionary that accompanies
+        ``config``. The default returns ``config`` untouched.
+        """
+        return config
+
+    def patch_model_hf_config(
+        self,
+        original_model: str | Path,
+        hf_config: "PretrainedConfig",
+    ) -> "PretrainedConfig":
+        """Return the config to use in place of ``hf_config``.
+
+        The default returns ``hf_config`` untouched.
+        """
+        return hf_config
+
+    def resolve_tokenizer_init(
+        self,
+        tokenizer_name: str | Path,
+        *args: Any,
+        revision: str | None = None,
+        runner_type: str = "generate",
+        tokenizer_mode: str = "auto",
+        **kwargs: Any,
+    ) -> tuple[str | Path, tuple[Any, ...], dict[str, Any]]:
+        """Return the ``(tokenizer_name, args, kwargs)`` to build a tokenizer.
+
+        ``revision``, ``runner_type`` and ``tokenizer_mode`` are inputs only;
+        they are not part of the returned ``kwargs``. The default passes the
+        remaining arguments through unchanged.
+        """
+        return tokenizer_name, args, kwargs
+
+    def resolve_processor_source(
+        self,
+        model_config: Any,
+        component: str,
+    ) -> tuple[str | Path, str | None]:
+        """Return the ``(model, revision)`` source for ``component``.
+
+        The default uses ``model_config.model`` and ``model_config.revision``
+        regardless of ``component``.
+        """
+        return model_config.model, model_config.revision
+
+    def validate_model_config(self, model_config: Any) -> None:
+        """Check ``model_config``; the default accepts it without checks."""
+        return
+
+    def resolve_sentence_transformer_source(
+        self,
+        model: str | Path,
+        revision: str | None = None,
+    ) -> str | Path:
+        """Return where to look up Sentence Transformers files for ``model``.
+
+        The default returns ``model`` unchanged.
+        """
+        return model
+
+    def resolve_image_processor_source(
+        self,
+        model: str | Path,
+        revision: str | None = None,
+    ) -> str | Path:
+        """Return where to load the image processor config for ``model`` from.
+
+        The default returns ``model`` unchanged.
+        """
+        return model
+
+    def should_skip_generation_config(self, model: str | Path) -> bool:
+        """Return True to skip loading a generation config for ``model``."""
+        return False
+
+
+# Registration order matters: `get_model_format_handler` scans this list from
+# the end, so the most recently appended matching handler wins.
+_MODEL_FORMAT_HANDLERS: list[ModelFormatHandler] = []
+
+
+def register_model_format(handler: ModelFormatHandler) -> ModelFormatHandler:
+    """Register ``handler`` and return it.
+
+    Registering an instance that is already registered is a no-op. Otherwise
+    a handler whose non-empty ``name`` equals that of a registered handler
+    replaces it in place (with a warning); any other handler is appended.
+
+    Raises:
+        ValueError: If ``handler`` is not a `ModelFormatHandler` instance.
+    """
+    if not isinstance(handler, ModelFormatHandler):
+        raise ValueError("The model format handler must subclass `ModelFormatHandler`.")
+
+    # Registering the same object twice must neither duplicate it in the list
+    # nor report it as overwriting itself.
+    if any(existing is handler for existing in _MODEL_FORMAT_HANDLERS):
+        return handler
+
+    replaced = False
+    if handler.name:
+        for idx, existing in enumerate(_MODEL_FORMAT_HANDLERS):
+            if existing.name == handler.name:
+                logger.warning(
+                    "The model format handler %r already exists and will be "
+                    "overwritten by %s.",
+                    handler.name,
+                    type(handler),
+                )
+                _MODEL_FORMAT_HANDLERS[idx] = handler
+                replaced = True
+                break
+
+    if not replaced:
+        _MODEL_FORMAT_HANDLERS.append(handler)
+
+    return handler
+
+
+def get_model_format_handler(model: str | Path | None) -> ModelFormatHandler | None:
+    """Return the handler whose `matches` accepts ``model``, or None.
+
+    Handlers are tried from the end of the registry to its start.
+    """
+    for handler in reversed(_MODEL_FORMAT_HANDLERS):
+        if handler.matches(model):
+            return handler
+    return None
+
+
+def prepare_hf_model_reference(
+    model: str | Path,
+    revision: str | None = None,
+    **kwargs: Any,
+) -> tuple[ModelFormatHandler | None, str | Path, dict[str, Any]]:
+    """Resolve ``model`` through its format handler, if one matches.
+
+    Returns:
+        ``(handler, model, kwargs)``. Without a matching handler this is
+        ``(None, model, kwargs)`` with the inputs unchanged; otherwise the
+        model and kwargs are those returned by the handler's
+        `prepare_hf_config_load`.
+    """
+    handler = get_model_format_handler(model)
+    if handler is None:
+        return None, model, kwargs
+    resolved_model, resolved_kwargs = handler.prepare_hf_config_load(
+        model,
+        revision=revision,
+        kwargs=kwargs,
+    )
+    return handler, resolved_model, resolved_kwargs
+
+
+__all__ = [
+    "ModelFormatHandler",
+    "get_model_format_handler",
+    "prepare_hf_model_reference",
+    "register_model_format",
+]
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index bb525d9251f4..c3d9d49428fe 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -409,7 +409,6 @@ class RocmPlatform(Platform):
         "fp8",
         "compressed-tensors",
         "fbgemm_fp8",
-        "gguf",
         "quark",
         "mxfp4",
         "torchao",
diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py
index 7d48e3c6ff91..c4ee1ad9a310 100644
--- a/vllm/tokenizers/registry.py
+++ b/vllm/tokenizers/registry.py
@@ -10,13 +10,7 @@
 
 import vllm.envs as envs
 from vllm.logger import init_logger
-from vllm.transformers_utils.gguf_utils import (
-    check_gguf_file,
-    get_gguf_file_path_from_hf,
-    is_gguf,
-    is_remote_gguf,
-    split_remote_gguf,
-)
+from vllm.model_format import get_model_format_handler
 from vllm.transformers_utils.repo_utils import (
     any_pattern_in_repo_files,
     is_mistral_model_repo,
@@ -115,20 +109,15 @@ def resolve_tokenizer_args(
                 )
                 tokenizer_name = tokenizer_path
 
-    # Separate model folder from file path for GGUF models
-    if is_gguf(tokenizer_name):
-        if check_gguf_file(tokenizer_name):
-            kwargs["gguf_file"] = Path(tokenizer_name).name
-            tokenizer_name = Path(tokenizer_name).parent
-        elif is_remote_gguf(tokenizer_name):
-            tokenizer_name, quant_type = split_remote_gguf(tokenizer_name)
-            # Get the HuggingFace Hub path for the GGUF file
-            gguf_file = get_gguf_file_path_from_hf(
-                tokenizer_name,
-                quant_type,
-                revision=revision,
-            )
-            kwargs["gguf_file"] = gguf_file
+    if handler := get_model_format_handler(tokenizer_name):
+        tokenizer_name, args, kwargs = handler.resolve_tokenizer_init(
+            tokenizer_name,
+            *args,
+            revision=revision,
+            runner_type=runner_type,
+            tokenizer_mode=tokenizer_mode,
+            **kwargs,
+        )
 
     if "truncation_side" not in kwargs:
         if runner_type == "generate" or runner_type == "draft":
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 5f4b5a3b2a48..fa8beacfebcf 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -18,7 +18,6 @@
 from transformers import GenerationConfig, PretrainedConfig
 from transformers.models.auto.image_processing_auto import get_image_processor_config
 from transformers.models.auto.modeling_auto import (
-    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
     MODEL_MAPPING_NAMES,
 )
 from transformers.models.auto.tokenization_auto import get_tokenizer_config
@@ -26,6 +25,10 @@
 
 from vllm import envs
 from vllm.logger import init_logger
+from vllm.model_format import (
+    get_model_format_handler,
+    prepare_hf_model_reference,
+)
 from vllm.transformers_utils.repo_utils import is_mistral_model_repo
 from vllm.transformers_utils.utils import (
     parse_safetensors_file_metadata,
@@ -34,12 +37,6 @@
 from vllm.utils.torch_utils import common_broadcastable_dtype
 
 from .config_parser_base import ConfigParserBase
-from .gguf_utils import (
-    check_gguf_file,
-    is_gguf,
-    is_remote_gguf,
-    split_remote_gguf,
-)
 from .repo_utils import (
     file_or_path_exists,
     get_hf_file_to_dict,
@@ -573,20 +570,17 @@ def maybe_override_with_speculators(
     Returns:
         Tuple of (resolved_model, resolved_tokenizer, speculative_config)
     """
-    if check_gguf_file(model):
-        kwargs["gguf_file"] = Path(model).name
-        gguf_model_repo = Path(model).parent
-    elif is_remote_gguf(model):
-        repo_id, _ = split_remote_gguf(model)
-        gguf_model_repo = Path(repo_id)
-    else:
-        gguf_model_repo = None
     kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE
+    _, resolved_model, resolved_kwargs = prepare_hf_model_reference(
+        model,
+        revision=revision,
+        **kwargs,
+    )
     config_dict, _ = PretrainedConfig.get_config_dict(
-        model if gguf_model_repo is None else gguf_model_repo,
+        resolved_model,
         revision=revision,
         token=hf_token,
-        **without_trust_remote_code(kwargs),
+        **without_trust_remote_code(resolved_kwargs),
     )
     speculators_config = config_dict.get("speculators_config")
 
@@ -621,23 +615,20 @@ def get_config(
     hf_overrides_fn: Callable[[PretrainedConfig], PretrainedConfig] | None = None,
     **kwargs,
 ) -> PretrainedConfig:
-    # Separate model folder from file path for GGUF models
-
-    _is_gguf = is_gguf(model)
-    _is_remote_gguf = is_remote_gguf(model)
-    if _is_gguf:
-        if check_gguf_file(model):
-            # Local GGUF file
-            kwargs["gguf_file"] = Path(model).name
-            model = Path(model).parent
-        elif _is_remote_gguf:
-            # Remote GGUF - extract repo_id from repo_id:quant_type format
-            # The actual GGUF file will be downloaded later by GGUFModelLoader
-            # Keep model as repo_id:quant_type for download, but use repo_id for config
-            model, _ = split_remote_gguf(model)
+    original_model = model
+    handler, model, kwargs = prepare_hf_model_reference(
+        model,
+        revision=revision,
+        **kwargs,
+    )
 
     if config_format == "auto":
         try:
+            use_hf_parser_without_config = (
+                handler.should_use_hf_config_parser(original_model, model)
+                if handler is not None
+                else False
+            )
             # First check for Mistral to avoid defaulting to
             # Transformers implementation.
             if is_mistral_model_repo(
@@ -646,26 +637,22 @@ def get_config(
                 model=model, config_name=MISTRAL_CONFIG_NAME, revision=revision
             ):
                 config_format = "mistral"
-            elif (_is_gguf and not _is_remote_gguf) or file_or_path_exists(
+            elif use_hf_parser_without_config or file_or_path_exists(
                 model, HF_CONFIG_NAME, revision=revision
             ):
                 config_format = "hf"
-            # Remote GGUF models must have config.json in repo,
-            # otherwise the config can't be parsed correctly.
-            # FIXME(Isotr0py): Support remote GGUF repos without config.json
-            elif _is_remote_gguf and not file_or_path_exists(
-                model, HF_CONFIG_NAME, revision=revision
-            ):
-                err_msg = (
-                    "Could not find config.json for remote GGUF model repo. "
-                    "To load remote GGUF model through `<repo_id>:<quant_type>`, "
-                    "ensure your model has config.json (HF format) file. "
-                    "Otherwise please specify --hf-config-path <original_repo> "
-                    "in engine args to fetch config from unquantized hf model."
-                )
-                logger.error(err_msg)
-                raise ValueError(err_msg)
             else:
+                if (
+                    handler is not None
+                    and (
+                        err_msg := handler.get_missing_hf_config_error(
+                            original_model, model
+                        )
+                    )
+                    is not None
+                ):
+                    logger.error(err_msg)
+                    raise ValueError(err_msg)
                 raise ValueError(
                     "Could not detect config format for no config file found. "
                     "With config_format 'auto', ensure your model has either "
@@ -685,7 +672,7 @@ def get_config(
                 "'config.json'.\n"
                 "   - For Mistral models: ensure the presence of a "
                 "'params.json'.\n"
-            ).format(model=model)
+            ).format(model=original_model)
 
             raise ValueError(error_message) from e
 
@@ -698,34 +685,8 @@ def get_config(
         hf_overrides=hf_overrides_kw or hf_overrides_fn,
         **kwargs,
     )
-
-    # Patching defaults for GGUF models
-    if _is_gguf:
-        # Some models have different default values between GGUF and HF.
-        def apply_gguf_default(key: str, gguf_default: Any):
-            """
-            Apply GGUF defaults unless explicitly configured.
-
-            This function reads/writes external `config` and `config_dict`.
-            If the specified `key` is not in `config_dict` (i.e. not explicitly
-            configured and the default HF value is used), it updates the
-            corresponding `config` value to `gguf_default`.
-            """
-            if key not in config_dict:
-                config.update({key: gguf_default})
-
-        # Apply architecture-specific GGUF defaults.
-        if config.model_type in {"qwen3_moe"}:
-            # Qwen3 MoE: norm_topk_prob is always true.
-            # Note that, this parameter is always false (HF default) on Qwen2 MoE.
-            apply_gguf_default("norm_topk_prob", True)
-
-    # Special architecture mapping check for GGUF models
-    if _is_gguf:
-        if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
-            raise RuntimeError(f"Can't get gguf config for {config.model_type}.")
-        model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type]
-        config.update({"architectures": [model_type]})
+    if handler is not None:
+        config = handler.patch_parsed_hf_config(original_model, config_dict, config)
 
     # Architecture mapping for models without explicit architectures field
     if not config.architectures:
@@ -818,8 +779,8 @@ def get_pooling_config(
         A dictionary containing the pooling type and whether
             normalization is used, or None if no pooling configuration is found.
     """
-    if is_remote_gguf(model):
-        model, _ = split_remote_gguf(model)
+    if handler := get_model_format_handler(model):
+        model = handler.resolve_sentence_transformer_source(model, revision)
 
     modules_file_name = "modules.json"
 
@@ -908,6 +869,9 @@ def get_sentence_transformer_tokenizer_config(
     - dict: A dictionary containing the configuration parameters
     for the Sentence Transformer BERT model.
     """
+    if handler := get_model_format_handler(model):
+        model = handler.resolve_sentence_transformer_source(model, revision)
+
     sentence_transformer_config_files = [
         "sentence_bert_config.json",
         "sentence_roberta_config.json",
@@ -1036,11 +1000,8 @@ def get_hf_image_processor_config(
     # ModelScope does not provide an interface for image_processor
     if envs.VLLM_USE_MODELSCOPE:
         return dict()
-    # Separate model folder from file path for GGUF models
-    if check_gguf_file(model):
-        model = Path(model).parent
-    elif is_remote_gguf(model):
-        model, _ = split_remote_gguf(model)
+    if handler := get_model_format_handler(model):
+        model = handler.resolve_image_processor_source(model, revision)
     return get_image_processor_config(
         model, token=hf_token, revision=revision, **kwargs
     )
@@ -1070,12 +1031,9 @@ def try_get_generation_config(
     config_format: str | ConfigFormat = "auto",
     hf_token: bool | str | None = None,
 ) -> GenerationConfig | None:
-    # GGUF files don't have generation_config.json - their config is embedded
-    # in the file header. Skip all filesystem lookups to avoid re-reading the
-    # memory-mapped file, which can hang in multi-process scenarios when the
-    # EngineCore process already has the file mapped.
-    if is_gguf(model):
-        return None
+    if handler := get_model_format_handler(model):
+        if handler.should_skip_generation_config(model):
+            return None
 
     try:
         return GenerationConfig.from_pretrained(
diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py
deleted file mode 100644
index 7708378ee13b..000000000000
--- a/vllm/transformers_utils/gguf_utils.py
+++ /dev/null
@@ -1,336 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""GGUF utility functions."""
-
-from functools import cache
-from os import PathLike
-from pathlib import Path
-
-import gguf
-import regex as re
-from gguf.constants import Keys, VisionProjectorType
-from gguf.quants import GGMLQuantizationType
-from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig
-
-from vllm.logger import init_logger
-
-from .repo_utils import list_filtered_repo_files
-
-logger = init_logger(__name__)
-
-
-@cache
-def check_gguf_file(model: str | PathLike) -> bool:
-    """Check if the file is a GGUF model."""
-    model = Path(model)
-    if not model.is_file():
-        return False
-    elif model.suffix == ".gguf":
-        return True
-
-    try:
-        with model.open("rb") as f:
-            header = f.read(4)
-
-        return header == b"GGUF"
-    except Exception as e:
-        logger.debug("Error reading file %s: %s", model, e)
-        return False
-
-
-@cache
-def is_remote_gguf(model: str | Path) -> bool:
-    """Check if the model is a remote GGUF model.
-
-    Recognizes two forms:
-    1. Standard: ``repo_id:quant_type`` where *quant_type* is a known
-       GGML quantization type (e.g. ``Q4_K_M``).
-    2. Non-standard: ``repo_id:quant_type`` where *quant_type* contains
-       a known GGML type with extra prefixes (e.g. ``UD-Q4_K_XL``).
-       A warning is logged and actual file existence is validated later
-       during download.
-    """
-    pattern = r"^[a-zA-Z0-9][a-zA-Z0-9._-]*/[a-zA-Z0-9][a-zA-Z0-9._-]*:[A-Za-z0-9_+-]+$"
-    model = str(model)
-    if re.fullmatch(pattern, model):
-        _, quant_type = model.rsplit(":", 1)
-        if is_valid_gguf_quant_type(quant_type):
-            return True
-        if is_nonstandard_gguf_quant_type(quant_type):
-            logger.warning(
-                "Non-standard GGUF quant type '%s' detected.",
-                quant_type,
-            )
-            return True
-    return False
-
-
-def is_nonstandard_gguf_quant_type(quant_type: str) -> bool:
-    """Check if a non-standard quant type contains a known GGML type.
-
-    Splits the quant type by the last ``-`` and checks whether the
-    trailing part is a standard GGML type.  For example::
-
-        UD-Q4_K_XL      → rsplit → ["UD", "Q4_K_XL"]      → Q4_K_XL valid ✓
-        UD-IQ4_NL       → rsplit → ["UD", "IQ4_NL"]       → IQ4_NL  valid ✓
-        Custom-UD-Q4_K  → rsplit → ["Custom-UD", "Q4_K"]  → Q4_K    valid ✓
-        RANDOM          → no "-" → False
-    """
-    if "-" not in quant_type:
-        return False
-    _, remainder = quant_type.rsplit("-", 1)
-    return is_valid_gguf_quant_type(remainder)
-
-
-# Common suffixes used in GGUF file naming conventions
-# e.g., Q4_K_M, Q3_K_S, Q5_K_L, Q2_K_XL
-_GGUF_QUANT_SUFFIXES = ("_M", "_S", "_L", "_XL", "_XS", "_XXS")
-
-
-def is_valid_gguf_quant_type(gguf_quant_type: str) -> bool:
-    """Check if the quant type is a valid GGUF quant type.
-
-    Supports both exact GGML quant types (e.g., Q4_K, IQ1_S) and
-    extended naming conventions (e.g., Q4_K_M, Q3_K_S, Q5_K_L).
-    """
-    # Check for exact match first
-    if getattr(GGMLQuantizationType, gguf_quant_type, None) is not None:
-        return True
-
-    # Check for extended naming conventions (e.g., Q4_K_M -> Q4_K)
-    for suffix in _GGUF_QUANT_SUFFIXES:
-        if gguf_quant_type.endswith(suffix):
-            base_type = gguf_quant_type[: -len(suffix)]
-            if getattr(GGMLQuantizationType, base_type, None) is not None:
-                return True
-
-    return False
-
-
-def split_remote_gguf(model: str | Path) -> tuple[str, str]:
-    """Split the model into repo_id and quant type."""
-    model = str(model)
-    if is_remote_gguf(model):
-        parts = model.rsplit(":", 1)
-        return (parts[0], parts[1])
-    raise ValueError(
-        f"Wrong GGUF model or invalid GGUF quant type: {model}.\n"
-        "- It should be in repo_id:quant_type format.\n"
-        f"- Valid base quant types: {GGMLQuantizationType._member_names_}\n"
-        f"- Extended suffixes also supported: {_GGUF_QUANT_SUFFIXES}\n"
-        "- Non-standard GGUF quant types also supported: "
-        "dash-separated prefixes (e.g. UD-Q4_K_XL, Custom-Q8_0)",
-    )
-
-
-def is_gguf(model: str | Path) -> bool:
-    """Check if the model is a GGUF model.
-
-    Args:
-        model: Model name, path, or Path object to check.
-
-    Returns:
-        True if the model is a GGUF model, False otherwise.
-    """
-    model = str(model)
-
-    # Check if it's a local GGUF file
-    if check_gguf_file(model):
-        return True
-
-    # Check if it's a remote GGUF model (repo_id:quant_type format)
-    return is_remote_gguf(model)
-
-
-def detect_gguf_multimodal(model: str) -> Path | None:
-    """Check if GGUF model has multimodal projector file.
-
-    Args:
-        model: Model path string
-
-    Returns:
-        Path to mmproj file if found, None otherwise
-    """
-    if not model.endswith(".gguf"):
-        return None
-
-    try:
-        model_path = Path(model)
-        if not model_path.is_file():
-            return None
-
-        model_dir = model_path.parent
-        mmproj_patterns = ["mmproj.gguf", "mmproj-*.gguf", "*mmproj*.gguf"]
-        for pattern in mmproj_patterns:
-            mmproj_files = list(model_dir.glob(pattern))
-            if mmproj_files:
-                return mmproj_files[0]
-        return None
-    except Exception:
-        return None
-
-
-def extract_vision_config_from_gguf(mmproj_path: str) -> "SiglipVisionConfig | None":
-    """Extract vision config parameters from mmproj.gguf metadata.
-
-    Reads vision encoder configuration from GGUF metadata fields using
-    standardized GGUF constants. Automatically detects the projector type
-    (e.g., gemma3, llama4) and applies model-specific parameters accordingly.
-
-    The function extracts standard CLIP vision parameters from GGUF metadata
-    and applies projector-type-specific customizations. For unknown projector
-    types, it uses safe defaults from SiglipVisionConfig.
-
-    Args:
-        mmproj_path: Path to mmproj.gguf file (str or Path)
-
-    Returns:
-        SiglipVisionConfig if extraction succeeds, None if any required
-        field is missing from the GGUF metadata
-
-    Raises:
-        Exception: Exceptions from GGUF reading (file not found, corrupted
-            file, etc.) propagate directly from gguf.GGUFReader
-    """
-    reader = gguf.GGUFReader(str(mmproj_path))
-
-    # Detect projector type to apply model-specific parameters
-    projector_type = None
-    projector_type_field = reader.get_field(Keys.Clip.PROJECTOR_TYPE)
-    if projector_type_field:
-        try:
-            projector_type = bytes(projector_type_field.parts[-1]).decode("utf-8")
-        except (AttributeError, UnicodeDecodeError) as e:
-            logger.warning("Failed to decode projector type from GGUF: %s", e)
-
-    # Map GGUF field constants to SiglipVisionConfig parameters.
-    # Uses official GGUF constants from gguf-py for standardization.
-    # Format: {gguf_constant: (param_name, dtype)}
-    VISION_CONFIG_FIELDS = {
-        Keys.ClipVision.EMBEDDING_LENGTH: ("hidden_size", int),
-        Keys.ClipVision.FEED_FORWARD_LENGTH: ("intermediate_size", int),
-        Keys.ClipVision.BLOCK_COUNT: ("num_hidden_layers", int),
-        Keys.ClipVision.Attention.HEAD_COUNT: ("num_attention_heads", int),
-        Keys.ClipVision.IMAGE_SIZE: ("image_size", int),
-        Keys.ClipVision.PATCH_SIZE: ("patch_size", int),
-        Keys.ClipVision.Attention.LAYERNORM_EPS: ("layer_norm_eps", float),
-    }
-
-    # Extract and validate all required fields
-    config_params = {}
-    for gguf_key, (param_name, dtype) in VISION_CONFIG_FIELDS.items():
-        field = reader.get_field(gguf_key)
-        if field is None:
-            logger.warning(
-                "Missing required vision config field '%s' in mmproj.gguf",
-                gguf_key,
-            )
-            return None
-        # Extract scalar value from GGUF field and convert to target type
-        config_params[param_name] = dtype(field.parts[-1])
-
-    # Apply model-specific parameters based on projector type
-    if projector_type == VisionProjectorType.GEMMA3:
-        # Gemma3 doesn't use the vision pooling head (multihead attention)
-        # This is a vLLM-specific parameter used in SiglipVisionTransformer
-        config_params["vision_use_head"] = False
-        logger.info("Detected Gemma3 projector, disabling vision pooling head")
-    # Add other projector-type-specific customizations here as needed
-    # elif projector_type == VisionProjectorType.LLAMA4:
-    #     config_params["vision_use_head"] = ...
-
-    # Create config with extracted parameters
-    # Note: num_channels and attention_dropout use SiglipVisionConfig defaults
-    # (3 and 0.0 respectively) which are correct for all models
-    config = SiglipVisionConfig(**config_params)
-
-    if projector_type:
-        logger.info(
-            "Extracted vision config from mmproj.gguf (projector_type: %s)",
-            projector_type,
-        )
-    else:
-        logger.info("Extracted vision config from mmproj.gguf metadata")
-
-    return config
-
-
-def maybe_patch_hf_config_from_gguf(
-    model: str,
-    hf_config: PretrainedConfig,
-) -> PretrainedConfig:
-    """Patch HF config for GGUF models.
-
-    Applies GGUF-specific patches to HuggingFace config:
-    1. For multimodal models: patches architecture and vision config
-    2. For all GGUF models: overrides vocab_size from embedding tensor
-
-    This ensures compatibility with GGUF models that have extended
-    vocabularies (e.g., Unsloth) where the GGUF file contains more
-    tokens than the HuggingFace tokenizer config specifies.
-
-    Args:
-        model: Model path string
-        hf_config: HuggingFace config to patch in-place
-
-    Returns:
-        Updated HuggingFace config
-    """
-    # Patch multimodal config if mmproj.gguf exists
-    mmproj_path = detect_gguf_multimodal(model)
-    if mmproj_path is not None:
-        vision_config = extract_vision_config_from_gguf(str(mmproj_path))
-
-        # Create HF config for Gemma3 multimodal
-        text_config = hf_config.get_text_config()
-        is_gemma3 = hf_config.model_type in ("gemma3", "gemma3_text")
-        if vision_config is not None and is_gemma3:
-            new_hf_config = Gemma3Config(
-                text_config=text_config,
-                vision_config=vision_config,
-                architectures=["Gemma3ForConditionalGeneration"],
-            )
-            hf_config = new_hf_config
-
-    return hf_config
-
-
-def get_gguf_file_path_from_hf(
-    repo_id: str | Path,
-    quant_type: str,
-    revision: str | None = None,
-) -> str:
-    """Get the GGUF file path from HuggingFace Hub based on repo_id and quant_type.
-
-    Args:
-        repo_id: The HuggingFace repository ID (e.g., "Qwen/Qwen3-0.6B")
-        quant_type: The quantization type (e.g., "Q4_K_M", "F16")
-        revision: Optional revision/branch name
-
-    Returns:
-        The path to the GGUF file on HuggingFace Hub (e.g., "filename.gguf"),
-    """
-    repo_id = str(repo_id)
-    gguf_patterns = [
-        f"*-{quant_type}.gguf",
-        f"*-{quant_type}-*.gguf",
-        f"*/*-{quant_type}.gguf",
-        f"*/*-{quant_type}-*.gguf",
-    ]
-    matching_files = list_filtered_repo_files(
-        repo_id,
-        allow_patterns=gguf_patterns,
-        revision=revision,
-    )
-
-    if len(matching_files) == 0:
-        raise ValueError(
-            "Could not find GGUF file for repo %s with quantization %s.",
-            repo_id,
-            quant_type,
-        )
-
-    # Sort to ensure consistent ordering (prefer non-sharded files)
-    matching_files.sort(key=lambda x: (x.count("-"), x))
-    gguf_filename = matching_files[0]
-    return gguf_filename
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
index 0e241f6abfd1..ed9d764cdc43 100644
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -24,8 +24,8 @@
 from typing_extensions import TypeVar
 
 from vllm.logger import init_logger
+from vllm.model_format import get_model_format_handler
 from vllm.transformers_utils import processors
-from vllm.transformers_utils.gguf_utils import is_gguf
 from vllm.transformers_utils.repo_utils import get_hf_file_to_dict
 from vllm.transformers_utils.utils import convert_model_repo_to_path
 from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
@@ -341,13 +341,8 @@ def cached_processor_from_config(
     processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
     **kwargs: Any,
 ) -> _P:
-    if is_gguf(model_config.model):
-        assert not is_gguf(model_config.tokenizer), (
-            "For multimodal GGUF models, the original tokenizer "
-            "should be used to correctly load processor."
-        )
-        model = model_config.tokenizer
-        revision = model_config.tokenizer_revision
+    if handler := get_model_format_handler(model_config.model):
+        model, revision = handler.resolve_processor_source(model_config, "processor")
     else:
         model = model_config.model
         revision = model_config.revision
@@ -455,13 +450,10 @@ def cached_image_processor_from_config(
     model_config: "ModelConfig",
     **kwargs: Any,
 ):
-    if is_gguf(model_config.model):
-        assert not is_gguf(model_config.tokenizer), (
-            "For multimodal GGUF models, the original tokenizer "
-            "should be used to correctly load image processor."
+    if handler := get_model_format_handler(model_config.model):
+        model, revision = handler.resolve_processor_source(
+            model_config, "image_processor"
         )
-        model = model_config.tokenizer
-        revision = model_config.tokenizer_revision
     else:
         model = model_config.model
         revision = model_config.revision
diff --git a/vllm/v1/metrics/perf.py b/vllm/v1/metrics/perf.py
index 91629cb57816..d76a54aae377 100644
--- a/vllm/v1/metrics/perf.py
+++ b/vllm/v1/metrics/perf.py
@@ -66,7 +66,6 @@ class InvalidComponent(Exception):
     "bitsandbytes": 0.5,
     "modelopt_fp4": 0.5,
     "petit_nvfp4": 0.5,
-    "gguf": 0.5,
     "compressed-tensors": 0.5,
     "torchao": 0.5,
     "quark": 0.5,

From 642e1ecc99fc5869ef44b7f21daac8cc2c86d535 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 12 Apr 2026 14:44:11 +0800
Subject: [PATCH 02/21] remove gguf weight materialization from linear layers

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm/model_executor/layers/linear.py | 103 +--------------------------
 1 file changed, 1 insertion(+), 102 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 297a10bd3942..e6da811b3885 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -5,7 +5,7 @@
 from abc import abstractmethod
 
 import torch
-from torch.nn.parameter import Parameter, UninitializedParameter
+from torch.nn.parameter import Parameter
 
 import vllm.envs as envs
 from vllm.distributed import (
@@ -358,20 +358,6 @@ def __init__(
             self.register_parameter("bias", None)
 
     def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
-        # If the weight on disk does not have a shape, give it one
-        # (such scales for AutoFp8).
-        needs_custom_weight_materialization = getattr(
-            param, "needs_custom_weight_materialization", False
-        )
-        needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False)
-        if needs_custom_weight_type:
-            param.weight_type = loaded_weight.item()
-
-        if needs_custom_weight_materialization and isinstance(
-            param, UninitializedParameter
-        ):
-            param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype)
-
         if len(loaded_weight.shape) == 0:
             loaded_weight = loaded_weight.reshape(1)
 
@@ -535,22 +521,6 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         # no need to narrow
         is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit
 
-        needs_custom_weight_materialization = getattr(
-            param, "needs_custom_weight_materialization", False
-        )
-        needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False)
-        if needs_custom_weight_type:
-            param.weight_type = loaded_weight.item()
-
-        if needs_custom_weight_materialization and isinstance(
-            param, UninitializedParameter
-        ):
-            final_shape = list(loaded_weight.shape)
-            if output_dim is not None:
-                assert final_shape[output_dim] % self.tp_size == 0
-                final_shape[output_dim] = final_shape[output_dim] // self.tp_size
-            param.materialize(final_shape, dtype=loaded_weight.dtype)
-
         param_data = param.data
         if output_dim is not None and not is_sharded_weight:
             shard_size = param_data.shape[output_dim]
@@ -695,38 +665,6 @@ def weight_loader(
         loaded_shard_id: tuple[int, ...] | int | None = None,
     ):
         self.validate_shard_id(loaded_shard_id)
-        needs_custom_weight_materialization = getattr(
-            param, "needs_custom_weight_materialization", False
-        )
-        needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False)
-        if isinstance(loaded_shard_id, tuple) and (
-            needs_custom_weight_materialization or needs_custom_weight_type
-        ):
-            raise NotImplementedError(
-                "Shard id with multiple indices is not supported for this "
-                "format-specific weight loader."
-            )
-        if needs_custom_weight_type:
-            if loaded_shard_id is not None:
-                param.data[loaded_shard_id].copy_(loaded_weight)
-                param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
-            else:
-                param.shard_weight_type = {
-                    i: loaded_weight.item() for i, _ in enumerate(self.output_sizes)
-                }
-            return
-
-        if needs_custom_weight_materialization:
-            output_dim = getattr(param, "output_dim", None)
-            shard_size = loaded_weight.size(output_dim) // self.tp_size
-            start_idx = self.tp_rank * shard_size
-
-            if loaded_shard_id is not None:
-                loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
-                param.shard_id.append(loaded_shard_id)
-                param.shard_id_map[loaded_shard_id] = len(param.data_container)
-                param.data_container.append(loaded_weight)
-                return
 
         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
@@ -1172,30 +1110,6 @@ def weight_loader(
         loaded_shard_id: str | None = None,
     ):
         self.validate_shard_id(loaded_shard_id)
-        needs_custom_weight_materialization = getattr(
-            param, "needs_custom_weight_materialization", False
-        )
-        needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False)
-        if needs_custom_weight_type:
-            idx_map = {"q": 0, "k": 1, "v": 2}
-            if loaded_shard_id is not None:
-                param.data[idx_map[loaded_shard_id]].copy_(loaded_weight)
-                param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
-            else:
-                param.shard_weight_type = {k: loaded_weight.item() for k in idx_map}
-            return
-
-        if needs_custom_weight_materialization:
-            output_dim = getattr(param, "output_dim", None)
-            shard_size = loaded_weight.size(output_dim) // self.tp_size
-            start_idx = self.tp_rank * shard_size
-
-            if loaded_shard_id is not None:
-                loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
-                param.shard_id.append(loaded_shard_id)
-                param.shard_id_map[loaded_shard_id] = len(param.data_container)
-                param.data_container.append(loaded_weight)
-                return
 
         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
@@ -1484,21 +1398,6 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         # no need to narrow
         is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit
 
-        needs_custom_weight_materialization = getattr(
-            param, "needs_custom_weight_materialization", False
-        )
-        needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False)
-        if needs_custom_weight_type:
-            param.weight_type = loaded_weight.item()
-
-        if needs_custom_weight_materialization and isinstance(
-            param, UninitializedParameter
-        ):
-            weight_shape = list(loaded_weight.shape)
-            if input_dim:
-                weight_shape[input_dim] = weight_shape[input_dim] // self.tp_size
-            param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype)
-
         param_data = param.data
         if input_dim is not None and not is_sharded_weight:
             shard_size = param_data.shape[input_dim]

From 1b53ba7b2827ec51c514bea6e61c065a8b98fc21 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 12 Apr 2026 14:44:32 +0800
Subject: [PATCH 03/21] clean: drop model format handler hook from tokenizer registry

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm/tokenizers/registry.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py
index c4ee1ad9a310..7a963d570d9b 100644
--- a/vllm/tokenizers/registry.py
+++ b/vllm/tokenizers/registry.py
@@ -10,7 +10,6 @@
 
 import vllm.envs as envs
 from vllm.logger import init_logger
-from vllm.model_format import get_model_format_handler
 from vllm.transformers_utils.repo_utils import (
     any_pattern_in_repo_files,
     is_mistral_model_repo,
@@ -109,16 +108,6 @@ def resolve_tokenizer_args(
                 )
                 tokenizer_name = tokenizer_path
 
-    if handler := get_model_format_handler(tokenizer_name):
-        tokenizer_name, args, kwargs = handler.resolve_tokenizer_init(
-            tokenizer_name,
-            *args,
-            revision=revision,
-            runner_type=runner_type,
-            tokenizer_mode=tokenizer_mode,
-            **kwargs,
-        )
-
     if "truncation_side" not in kwargs:
         if runner_type == "generate" or runner_type == "draft":
             kwargs["truncation_side"] = "left"

From 6332347efc85eccbc50834a26b37d2f76c72f04b Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 12 Apr 2026 14:59:35 +0800
Subject: [PATCH 04/21] remove gguf materialization from vocab parallel embedding

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 .../layers/vocab_parallel_embedding.py        | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index 24c02a49448f..955f2ae1d30a 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -6,7 +6,7 @@
 
 import torch
 import torch.nn.functional as F
-from torch.nn.parameter import Parameter, UninitializedParameter
+from torch.nn.parameter import Parameter
 
 from vllm.distributed import (
     divide,
@@ -418,16 +418,6 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         output_dim = getattr(param, "output_dim", None)
         packed_dim = getattr(param, "packed_dim", None)
 
-        if getattr(param, "needs_custom_weight_type", None):
-            param.data.copy_(loaded_weight)
-            param.weight_type = loaded_weight.item()
-            return
-        elif isinstance(param, UninitializedParameter):
-            shape = list(loaded_weight.shape)
-            if output_dim is not None:
-                shape[output_dim] = self.num_embeddings_per_partition
-            param.materialize(tuple(shape), dtype=loaded_weight.dtype)
-
         # If parameter does not have output dim, then it should
         # be copied onto all gpus (e.g. g_idx for act_order gptq).
         if output_dim is None:
@@ -548,11 +538,8 @@ def __init__(
 
     def tie_weights(self, embed_tokens: VocabParallelEmbedding):
         """Tie the weights with word embeddings."""
-        if self.quant_config and self.quant_config.should_keep_tied_lm_head():
-            return embed_tokens
-        else:
-            self.weight = embed_tokens.weight
-            return self
+        self.weight = embed_tokens.weight
+        return self
 
     def forward(self, input_):
         del input_

From a270d85f7e1557f3ccb04780e6728154a3e15909 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Mon, 13 Apr 2026 01:14:27 +0800
Subject: [PATCH 05/21] clean: remove model_format module and its call sites

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm/config/model.py                 |  13 +--
 vllm/engine/arg_utils.py             |   4 -
 vllm/model_format.py                 | 162 ---------------------------
 vllm/transformers_utils/config.py    |  67 ++---------
 vllm/transformers_utils/processor.py |  22 +---
 5 files changed, 16 insertions(+), 252 deletions(-)
 delete mode 100644 vllm/model_format.py

diff --git a/vllm/config/model.py b/vllm/config/model.py
index 054f14a26fef..1a47f4e7266b 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -25,7 +25,6 @@
 from vllm.config.scheduler import RunnerType
 from vllm.config.utils import config, getattr_iter
 from vllm.logger import init_logger
-from vllm.model_format import get_model_format_handler
 from vllm.platforms import current_platform
 from vllm.tasks import PoolingTask, ScoreType, SupportedTask
 from vllm.transformers_utils.config import (
@@ -498,9 +497,6 @@ def __post_init__(
             hf_overrides_fn=hf_overrides_fn,
             token=self.hf_token,
         )
-        if handler := get_model_format_handler(self.model):
-            hf_config = handler.patch_model_hf_config(self.model, hf_config)
-
         self.hf_config = hf_config
         if dict_overrides:
             self._apply_dict_overrides(hf_config, dict_overrides)
@@ -658,10 +654,6 @@ def __post_init__(
                     "disable the cache with --mm-processor-cache-gb 0."
                 )
 
-        # Multimodal GGUF models must use original repo for mm processing
-        if handler := get_model_format_handler(self.model):
-            handler.validate_model_config(self)
-
         if self.disable_sliding_window:
             # Set after get_and_verify_max_len to ensure that max_model_len
             # can be correctly capped to sliding window size
@@ -814,10 +806,7 @@ def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> No
             self.tokenizer = object_storage_tokenizer.dir
 
     def _get_encoder_config(self) -> dict[str, Any] | None:
-        model = self.model
-        if handler := get_model_format_handler(model):
-            model = handler.resolve_sentence_transformer_source(model, self.revision)
-        return get_sentence_transformer_tokenizer_config(model, self.revision)
+        return get_sentence_transformer_tokenizer_config(self.model, self.revision)
 
     def _get_default_runner_type(
         self,
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index b42904367883..69500ec051cf 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -93,7 +93,6 @@
 from vllm.config.utils import get_field
 from vllm.config.vllm import OptimizationLevel, PerformanceMode
 from vllm.logger import init_logger, suppress_logging
-from vllm.model_format import get_model_format_handler
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.plugins import load_general_plugins
 from vllm.ray.lazy_utils import is_in_ray_actor, is_ray_initialized
@@ -1418,9 +1417,6 @@ def from_cli_args(cls, args: argparse.Namespace):
     def create_model_config(self) -> ModelConfig:
         load_general_plugins()
 
-        if handler := get_model_format_handler(self.model):
-            handler.update_engine_args(self)
-
         if not envs.VLLM_ENABLE_V1_MULTIPROCESSING:
             logger.warning(
                 "The global random seed is set to %d. Since "
diff --git a/vllm/model_format.py b/vllm/model_format.py
deleted file mode 100644
index cb77983a9470..000000000000
--- a/vllm/model_format.py
+++ /dev/null
@@ -1,162 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from pathlib import Path
-from typing import TYPE_CHECKING, Any
-
-from vllm.logger import init_logger
-
-if TYPE_CHECKING:
-    from transformers import PretrainedConfig
-
-logger = init_logger(__name__)
-
-
-class ModelFormatHandler:
-    """Extension hook for out-of-tree model formats.
-
-    Handlers can customize how a model reference is interpreted across vLLM,
-    such as model/config discovery, tokenizer and processor resolution, and
-    engine-arg defaults.
-    """
-
-    name: str = ""
-
-    def matches(self, model: str | Path | None) -> bool:
-        return False
-
-    def update_engine_args(self, engine_args: Any) -> None:
-        return
-
-    def prepare_hf_config_load(
-        self,
-        model: str | Path,
-        revision: str | None = None,
-        kwargs: dict[str, Any] | None = None,
-    ) -> tuple[str | Path, dict[str, Any]]:
-        return model, kwargs or {}
-
-    def should_use_hf_config_parser(
-        self,
-        original_model: str | Path,
-        resolved_model: str | Path,
-    ) -> bool:
-        return False
-
-    def get_missing_hf_config_error(
-        self,
-        original_model: str | Path,
-        resolved_model: str | Path,
-    ) -> str | None:
-        return None
-
-    def patch_parsed_hf_config(
-        self,
-        original_model: str | Path,
-        config_dict: dict[str, Any],
-        config: "PretrainedConfig",
-    ) -> "PretrainedConfig":
-        return config
-
-    def patch_model_hf_config(
-        self,
-        original_model: str | Path,
-        hf_config: "PretrainedConfig",
-    ) -> "PretrainedConfig":
-        return hf_config
-
-    def resolve_tokenizer_init(
-        self,
-        tokenizer_name: str | Path,
-        *args: Any,
-        revision: str | None = None,
-        runner_type: str = "generate",
-        tokenizer_mode: str = "auto",
-        **kwargs: Any,
-    ) -> tuple[str | Path, tuple[Any, ...], dict[str, Any]]:
-        return tokenizer_name, args, kwargs
-
-    def resolve_processor_source(
-        self,
-        model_config: Any,
-        component: str,
-    ) -> tuple[str | Path, str | None]:
-        return model_config.model, model_config.revision
-
-    def validate_model_config(self, model_config: Any) -> None:
-        return
-
-    def resolve_sentence_transformer_source(
-        self,
-        model: str | Path,
-        revision: str | None = None,
-    ) -> str | Path:
-        return model
-
-    def resolve_image_processor_source(
-        self,
-        model: str | Path,
-        revision: str | None = None,
-    ) -> str | Path:
-        return model
-
-    def should_skip_generation_config(self, model: str | Path) -> bool:
-        return False
-
-
-_MODEL_FORMAT_HANDLERS: list[ModelFormatHandler] = []
-
-
-def register_model_format(handler: ModelFormatHandler) -> ModelFormatHandler:
-    if not isinstance(handler, ModelFormatHandler):
-        raise ValueError("The model format handler must subclass `ModelFormatHandler`.")
-
-    replaced = False
-    if handler.name:
-        for idx, existing in enumerate(_MODEL_FORMAT_HANDLERS):
-            if existing.name == handler.name:
-                logger.warning(
-                    "The model format handler %r already exists and will be "
-                    "overwritten by %s.",
-                    handler.name,
-                    type(handler),
-                )
-                _MODEL_FORMAT_HANDLERS[idx] = handler
-                replaced = True
-                break
-
-    if not replaced:
-        _MODEL_FORMAT_HANDLERS.append(handler)
-
-    return handler
-
-
-def get_model_format_handler(model: str | Path | None) -> ModelFormatHandler | None:
-    for handler in reversed(_MODEL_FORMAT_HANDLERS):
-        if handler.matches(model):
-            return handler
-    return None
-
-
-def prepare_hf_model_reference(
-    model: str | Path,
-    revision: str | None = None,
-    **kwargs: Any,
-) -> tuple[ModelFormatHandler | None, str | Path, dict[str, Any]]:
-    handler = get_model_format_handler(model)
-    if handler is None:
-        return None, model, kwargs
-    resolved_model, resolved_kwargs = handler.prepare_hf_config_load(
-        model,
-        revision=revision,
-        kwargs=kwargs,
-    )
-    return handler, resolved_model, resolved_kwargs
-
-
-__all__ = [
-    "ModelFormatHandler",
-    "get_model_format_handler",
-    "prepare_hf_model_reference",
-    "register_model_format",
-]
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index fa8beacfebcf..3a6fb93fa1f9 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -25,10 +25,6 @@
 
 from vllm import envs
 from vllm.logger import init_logger
-from vllm.model_format import (
-    get_model_format_handler,
-    prepare_hf_model_reference,
-)
 from vllm.transformers_utils.repo_utils import is_mistral_model_repo
 from vllm.transformers_utils.utils import (
     parse_safetensors_file_metadata,
@@ -571,17 +567,15 @@ def maybe_override_with_speculators(
         Tuple of (resolved_model, resolved_tokenizer, speculative_config)
     """
     kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE
-    _, resolved_model, resolved_kwargs = prepare_hf_model_reference(
-        model,
-        revision=revision,
-        **kwargs,
-    )
-    config_dict, _ = PretrainedConfig.get_config_dict(
-        resolved_model,
-        revision=revision,
-        token=hf_token,
-        **without_trust_remote_code(resolved_kwargs),
-    )
+    try:
+        config_dict, _ = PretrainedConfig.get_config_dict(
+            model,
+            revision=revision,
+            token=hf_token,
+            **without_trust_remote_code(kwargs),
+        )
+    except OSError:
+        config_dict = {}
     speculators_config = config_dict.get("speculators_config")
 
     if speculators_config is None:
@@ -615,20 +609,8 @@ def get_config(
     hf_overrides_fn: Callable[[PretrainedConfig], PretrainedConfig] | None = None,
     **kwargs,
 ) -> PretrainedConfig:
-    original_model = model
-    handler, model, kwargs = prepare_hf_model_reference(
-        model,
-        revision=revision,
-        **kwargs,
-    )
-
     if config_format == "auto":
         try:
-            use_hf_parser_without_config = (
-                handler.should_use_hf_config_parser(original_model, model)
-                if handler is not None
-                else False
-            )
             # First check for Mistral to avoid defaulting to
             # Transformers implementation.
             if is_mistral_model_repo(
@@ -637,22 +619,9 @@ def get_config(
                 model=model, config_name=MISTRAL_CONFIG_NAME, revision=revision
             ):
                 config_format = "mistral"
-            elif use_hf_parser_without_config or file_or_path_exists(
-                model, HF_CONFIG_NAME, revision=revision
-            ):
+            elif file_or_path_exists(model, HF_CONFIG_NAME, revision=revision):
                 config_format = "hf"
             else:
-                if (
-                    handler is not None
-                    and (
-                        err_msg := handler.get_missing_hf_config_error(
-                            original_model, model
-                        )
-                    )
-                    is not None
-                ):
-                    logger.error(err_msg)
-                    raise ValueError(err_msg)
                 raise ValueError(
                     "Could not detect config format for no config file found. "
                     "With config_format 'auto', ensure your model has either "
@@ -672,7 +641,7 @@ def get_config(
                 "'config.json'.\n"
                 "   - For Mistral models: ensure the presence of a "
                 "'params.json'.\n"
-            ).format(model=original_model)
+            ).format(model=model)
 
             raise ValueError(error_message) from e
 
@@ -685,8 +654,6 @@ def get_config(
         hf_overrides=hf_overrides_kw or hf_overrides_fn,
         **kwargs,
     )
-    if handler is not None:
-        config = handler.patch_parsed_hf_config(original_model, config_dict, config)
 
     # Architecture mapping for models without explicit architectures field
     if not config.architectures:
@@ -779,9 +746,6 @@ def get_pooling_config(
         A dictionary containing the pooling type and whether
             normalization is used, or None if no pooling configuration is found.
     """
-    if handler := get_model_format_handler(model):
-        model = handler.resolve_sentence_transformer_source(model, revision)
-
     modules_file_name = "modules.json"
 
     modules_dict = None
@@ -869,9 +833,6 @@ def get_sentence_transformer_tokenizer_config(
     - dict: A dictionary containing the configuration parameters
     for the Sentence Transformer BERT model.
     """
-    if handler := get_model_format_handler(model):
-        model = handler.resolve_sentence_transformer_source(model, revision)
-
     sentence_transformer_config_files = [
         "sentence_bert_config.json",
         "sentence_roberta_config.json",
@@ -1000,8 +961,6 @@ def get_hf_image_processor_config(
     # ModelScope does not provide an interface for image_processor
     if envs.VLLM_USE_MODELSCOPE:
         return dict()
-    if handler := get_model_format_handler(model):
-        model = handler.resolve_image_processor_source(model, revision)
     return get_image_processor_config(
         model, token=hf_token, revision=revision, **kwargs
     )
@@ -1031,10 +990,6 @@ def try_get_generation_config(
     config_format: str | ConfigFormat = "auto",
     hf_token: bool | str | None = None,
 ) -> GenerationConfig | None:
-    if handler := get_model_format_handler(model):
-        if handler.should_skip_generation_config(model):
-            return None
-
     try:
         return GenerationConfig.from_pretrained(
             model,
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
index ed9d764cdc43..bf432fe8f1e0 100644
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -24,7 +24,6 @@
 from typing_extensions import TypeVar
 
 from vllm.logger import init_logger
-from vllm.model_format import get_model_format_handler
 from vllm.transformers_utils import processors
 from vllm.transformers_utils.repo_utils import get_hf_file_to_dict
 from vllm.transformers_utils.utils import convert_model_repo_to_path
@@ -341,15 +340,9 @@ def cached_processor_from_config(
     processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
     **kwargs: Any,
 ) -> _P:
-    if handler := get_model_format_handler(model_config.model):
-        model, revision = handler.resolve_processor_source(model_config, "processor")
-    else:
-        model = model_config.model
-        revision = model_config.revision
-
     return cached_get_processor_without_dynamic_kwargs(
-        model,
-        revision=revision,
+        model_config.model,
+        revision=model_config.revision,
         trust_remote_code=model_config.trust_remote_code,
         processor_cls=processor_cls,  # type: ignore[arg-type]
         **_merge_mm_kwargs(model_config, processor_cls, **kwargs),
@@ -450,16 +443,9 @@ def cached_image_processor_from_config(
     model_config: "ModelConfig",
     **kwargs: Any,
 ):
-    if handler := get_model_format_handler(model_config.model):
-        model, revision = handler.resolve_processor_source(
-            model_config, "image_processor"
-        )
-    else:
-        model = model_config.model
-        revision = model_config.revision
     return cached_get_image_processor(
-        model,
-        revision=revision,
+        model_config.model,
+        revision=model_config.revision,
         trust_remote_code=model_config.trust_remote_code,
         **_merge_mm_kwargs(model_config, AutoImageProcessor, **kwargs),
     )

From 4d940cb9791c556b3385ce1ac4b85a568d5ea074 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Thu, 7 May 2026 22:25:37 +0800
Subject: [PATCH 06/21] workaround tie words embedding

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm/model_executor/layers/vocab_parallel_embedding.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index ae8243d314bd..3321010ac6a9 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -77,6 +77,12 @@ def apply(
     def embedding(self, layer: torch.nn.Module, input_: torch.Tensor) -> torch.Tensor:
         return F.embedding(input_, layer.weight)
 
+    def tie_weights(
+        self, layer: torch.nn.Module, embed_tokens: "VocabParallelEmbedding"
+    ):
+        layer.weight = embed_tokens.weight
+        return layer
+
 
 def pad_vocab_size(vocab_size: int, pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int:
     """Pad the vocab size to the given value."""
@@ -544,8 +550,7 @@ def __init__(
 
     def tie_weights(self, embed_tokens: VocabParallelEmbedding):
         """Tie the weights with word embeddings."""
-        self.weight = embed_tokens.weight
-        return self
+        return self.quant_method.tie_weights(self, embed_tokens)
 
     def forward(self, input_):
         del input_

From 83fe14d0ec812a2b2851dfa7f51008f9bb4fb476 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Fri, 8 May 2026 23:25:19 +0800
Subject: [PATCH 07/21] remove siglip maybe_swap_ffn_param

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm/model_executor/models/siglip.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index c50d737878f5..bc6302068a73 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -952,27 +952,12 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
                 break
             else:
                 param = params_dict[name]
-                param = maybe_swap_ffn_param(
-                    name, param, loaded_weight, params_dict, self.quant_config
-                )
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
 
 
-def maybe_swap_ffn_param(
-    name: str,
-    param: torch.Tensor,
-    loaded_weight: torch.Tensor,
-    params_dict: dict[str, torch.Tensor],
-    quant_config: QuantizationConfig,
-) -> torch.Tensor:
-    if quant_config is None or ".fc" not in name:
-        return param
-    return quant_config.remap_loaded_parameter(name, param, loaded_weight, params_dict)
-
-
 # Adapted from: https://github.com/huggingface/transformers/blob/v4.54.1/src/transformers/models/siglip/modeling_siglip.py#L200
 class SiglipTextEmbeddings(nn.Module):
     def __init__(self, config: SiglipTextConfig):

From df565fa298414fb737e6f9024634ee1293d591f1 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 10 May 2026 14:57:32 +0800
Subject: [PATCH 08/21] pass quant_config

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm/model_executor/models/olmoe.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index 1f342ad1733d..5b661aa4e4de 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -279,12 +279,14 @@ def __init__(
         super().__init__()
 
         config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
 
         self.vocab_size = config.vocab_size
         self.config = config
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
+            quant_config=quant_config,
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,

From ca76814652a4f0c50d0956401c9c0312acd658ab Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 10 May 2026 15:23:44 +0800
Subject: [PATCH 09/21] clean

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm/model_executor/models/apertus.py   | 4 ----
 vllm/model_executor/models/exaone.py    | 4 ----
 vllm/model_executor/models/exaone4.py   | 4 ----
 vllm/model_executor/models/jais2.py     | 5 -----
 vllm/model_executor/models/llama.py     | 4 ----
 vllm/model_executor/models/llama4.py    | 4 ----
 vllm/model_executor/models/openpangu.py | 5 -----
 7 files changed, 30 deletions(-)

diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py
index 7254f41fd2f0..dcea424200d0 100644
--- a/vllm/model_executor/models/apertus.py
+++ b/vllm/model_executor/models/apertus.py
@@ -228,10 +228,6 @@ def _init_rotary_emb(
         quant_config: QuantizationConfig | None,
     ) -> None:
         is_neox_style = True
-        if quant_config is not None:
-            override = quant_config.override_is_neox_style(config.model_type)
-            if override is not None:
-                is_neox_style = override
 
         self.rotary_emb = get_rope(
             self.head_dim,
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index 4a56aa676073..f80a5b34c14f 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -162,10 +162,6 @@ def __init__(
         )
 
         is_neox_style = True
-        if quant_config is not None:
-            override = quant_config.override_is_neox_style(config.model_type)
-            if override is not None:
-                is_neox_style = override
 
         self.rotary_emb = get_rope(
             self.head_dim,
diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py
index 51b317a51229..81bd79ca8a7e 100644
--- a/vllm/model_executor/models/exaone4.py
+++ b/vllm/model_executor/models/exaone4.py
@@ -168,10 +168,6 @@ def __init__(
         self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
 
         is_neox_style = True
-        if quant_config is not None:
-            override = quant_config.override_is_neox_style(config.model_type)
-            if override is not None:
-                is_neox_style = override
 
         layer_idx = extract_layer_index(prefix)
         is_sliding = config.layer_types[layer_idx] == "sliding_attention"
diff --git a/vllm/model_executor/models/jais2.py b/vllm/model_executor/models/jais2.py
index 0f3ec5315847..f4303c4010e4 100644
--- a/vllm/model_executor/models/jais2.py
+++ b/vllm/model_executor/models/jais2.py
@@ -161,11 +161,6 @@ def __init__(
         )
 
         is_neox_style = True
-        if quant_config is not None:
-            override = quant_config.override_is_neox_style(config.model_type)
-            if override is not None:
-                is_neox_style = override
-
         self.rotary_emb = get_rope(
             self.head_dim,
             max_position=max_position_embeddings,
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 93607c967af4..ef704e8c0d95 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -238,10 +238,6 @@ def _init_rotary_emb(
         quant_config: QuantizationConfig | None,
     ) -> None:
         is_neox_style = True
-        if quant_config is not None:
-            override = quant_config.override_is_neox_style(config.model_type)
-            if override is not None:
-                is_neox_style = override
 
         self.rotary_emb = get_rope(
             self.head_dim,
diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py
index 349ede83f1a3..fc4e95be0406 100644
--- a/vllm/model_executor/models/llama4.py
+++ b/vllm/model_executor/models/llama4.py
@@ -237,10 +237,6 @@ def __init__(
             prefix=f"{prefix}.o_proj",
         )
         is_neox_style = True
-        if quant_config is not None:
-            override = quant_config.override_is_neox_style(config.model_type)
-            if override is not None:
-                is_neox_style = override
 
         self.rotary_emb = (
             get_rope(
diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py
index 29688b5af228..783655a08d97 100644
--- a/vllm/model_executor/models/openpangu.py
+++ b/vllm/model_executor/models/openpangu.py
@@ -517,11 +517,6 @@ def _init_rotary_emb(
         quant_config: QuantizationConfig | None,
     ) -> None:
         is_neox_style = True
-        if quant_config is not None:
-            override = quant_config.override_is_neox_style(config.model_type)
-            if override is not None:
-                is_neox_style = override
-
         rope_parameters = config.rope_parameters or {}
         if rope_parameters is not None and rope_parameters.get(
             "mrope_interleaved", False

From 4b1e2d0d8a17081cee5e07ecd8c01a5db3d17481 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 10 May 2026 15:29:14 +0800
Subject: [PATCH 10/21] clean openpangu

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm/model_executor/models/openpangu.py | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py
index 783655a08d97..a517c52e6902 100644
--- a/vllm/model_executor/models/openpangu.py
+++ b/vllm/model_executor/models/openpangu.py
@@ -712,22 +712,6 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
         # no need to narrow
         is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit
 
-        needs_custom_weight_materialization = getattr(
-            param, "needs_custom_weight_materialization", False
-        )
-        needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False)
-        if needs_custom_weight_type:
-            param.weight_type = loaded_weight.item()
-
-        if needs_custom_weight_materialization and isinstance(
-            param, nn.UninitializedParameter
-        ):
-            final_shape = list(loaded_weight.shape)
-            if output_dim is not None:
-                assert final_shape[output_dim] % self.tp_size == 0
-                final_shape[output_dim] = final_shape[output_dim] // self.tp_size
-            param.materialize(final_shape, dtype=loaded_weight.dtype)
-
         param_data = param.data
         if output_dim is not None and not is_sharded_weight:
             shard_size = param_data.shape[output_dim]

From 070ce481184b6bb12348f38ee6919e6e433b35cc Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 10 May 2026 15:31:55 +0800
Subject: [PATCH 11/21] clean dead code

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 .../layers/quantization/base_config.py        | 22 -------------------
 vllm/model_executor/models/gemma3.py          |  5 -----
 2 files changed, 27 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
index b870bbd94b39..8f70f3e7a0b2 100644
--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -169,28 +169,6 @@ def get_quant_method(
     def get_cache_scale(self, name: str) -> str | None:
         return None
 
-    def override_is_neox_style(self, model_type: str) -> bool | None:
-        return None
-
-    def should_keep_tied_lm_head(self) -> bool:
-        return False
-
-    def transform_loaded_weight(
-        self,
-        name: str,
-        loaded_weight: torch.Tensor,
-    ) -> torch.Tensor:
-        return loaded_weight
-
-    def remap_loaded_parameter(
-        self,
-        name: str,
-        param: torch.Tensor,
-        loaded_weight: torch.Tensor,
-        params_dict: dict[str, torch.Tensor],
-    ) -> torch.Tensor:
-        return param
-
     def apply_vllm_mapper(  # noqa: B027
         self, hf_to_vllm_mapper: "WeightsMapper"
     ):
diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py
index 5b3f0688de4d..72392b8f9ece 100644
--- a/vllm/model_executor/models/gemma3.py
+++ b/vllm/model_executor/models/gemma3.py
@@ -383,11 +383,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            if self.quant_config is not None:
-                loaded_weight = self.quant_config.transform_loaded_weight(
-                    name, loaded_weight
-                )
-
             if self.quant_config is not None and (
                 scale_name := self.quant_config.get_cache_scale(name)
             ):

From de99b2bab52eda9d93865be99445a22b422fd39c Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 10 May 2026 17:02:38 +0800
Subject: [PATCH 12/21] clean MoE weight loader

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm/model_executor/layers/fused_moe/layer.py | 23 -------------------
 1 file changed, 23 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index fd54ef8715c5..bbfe5af64277 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -6,7 +6,6 @@
 from typing import Literal, cast, get_args, overload
 
 import torch
-from torch.nn.parameter import UninitializedParameter
 
 from vllm._aiter_ops import rocm_aiter_ops
 from vllm.config import VllmConfig, get_current_vllm_config
@@ -1155,15 +1154,6 @@ def weight_loader(
         # dimension intermediate_size_per_partition is used.
         SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0}
 
-        needs_custom_weight_materialization = getattr(
-            param, "needs_custom_weight_materialization", False
-        )
-        needs_custom_weight_type = getattr(param, "needs_custom_weight_type", False)
-        if needs_custom_weight_type:
-            param.weight_type = loaded_weight.item()
-            param.data.copy_(loaded_weight)
-            return True if return_success else None
-
         # Case for BitsAndBytes
         use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
         if use_bitsandbytes_4bit:
@@ -1209,19 +1199,6 @@ def weight_loader(
         if full_load:
             shard_dim += 1
 
-        if needs_custom_weight_materialization and isinstance(
-            param, UninitializedParameter
-        ):
-            # To materialize a tensor, we must have full shape including
-            # number of experts, making this portion to require `full_load`.
-            assert full_load
-            final_shape = list(loaded_weight.shape)
-            # w1 and w3 are merged per expert.
-            if shard_id in {"w1", "w3"}:
-                final_shape[1] *= 2
-            final_shape[shard_dim] = final_shape[shard_dim] // self.tp_size
-            param.materialize(final_shape, dtype=loaded_weight.dtype)
-
         expert_data = param.data if full_load else param.data[expert_id]
 
         # Case input scale: input_scale loading is only supported for fp8

From 1c74b1c6333b2e04d745a854ffb9932f84f8ba69 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 10 May 2026 17:12:37 +0800
Subject: [PATCH 13/21] clean rocm test

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 requirements/test/rocm.txt | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/requirements/test/rocm.txt b/requirements/test/rocm.txt
index b61d74246452..9d95a9e4375b 100644
--- a/requirements/test/rocm.txt
+++ b/requirements/test/rocm.txt
@@ -273,12 +273,6 @@ genai-perf==0.0.16
     # via -r requirements/test/rocm.in
 genson==1.3.0
     # via datamodel-code-generator
-geopandas==1.1.3
-    # via terratorch
-gitdb==4.0.12
-    # via gitpython
-gitpython==3.1.46
-    # via wandb
 google-api-core==2.30.0
     # via
     #   google-cloud-core
@@ -585,8 +579,6 @@ numpy==2.2.6
     #   evaluate
     #   fastparquet
     #   genai-perf
-    #   geopandas
-    #   h5py
     #   imagehash
     #   imageio
     #   librosa

From f38f9bb8a5f26cb13e88bbc104d2f6a5ffed2b8d Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 10 May 2026 23:46:29 +0800
Subject: [PATCH 14/21] clean spec config overrides

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm/transformers_utils/config.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 955360dd472c..ecf0638ef2cb 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -602,15 +602,12 @@ def maybe_override_with_speculators(
         Tuple of (resolved_model, resolved_tokenizer, speculative_config)
     """
     kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE
-    try:
-        config_dict, _ = PretrainedConfig.get_config_dict(
-            model,
-            revision=revision,
-            token=hf_token,
-            **without_trust_remote_code(kwargs),
-        )
-    except OSError:
-        config_dict = {}
+    config_dict, _ = PretrainedConfig.get_config_dict(
+        model,
+        revision=revision,
+        token=hf_token,
+        **without_trust_remote_code(kwargs),
+    )
     speculators_config = config_dict.get("speculators_config")
 
     if speculators_config is None:

From fdf0b53b5aeb19461e7694c4f11d6b7cbf1ae63b Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Mon, 11 May 2026 00:08:45 +0800
Subject: [PATCH 15/21] clean unused config patch

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm/model_executor/layers/quantization/base_config.py | 4 ----
 vllm/model_executor/model_loader/weight_utils.py       | 3 ---
 2 files changed, 7 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
index 8f70f3e7a0b2..344ddd8abd25 100644
--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -129,10 +129,6 @@ def override_quantization_method(
         """
         return None
 
-    @classmethod
-    def requires_hf_quant_config(cls) -> bool:
-        return True
-
     @staticmethod
     def get_from_keys(config: dict[str, Any], keys: list[str]) -> Any:
         """Get a value from the model's quantization config."""
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index a19ead0a6b69..e0290a2d4496 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -258,9 +258,6 @@ def get_quant_config(
         raise ValueError("Model quantization method is not specified in the config.")
     quant_cls = get_quantization_config(model_config.quantization)
 
-    if not quant_cls.requires_hf_quant_config():
-        return quant_cls.from_config({})
-
     # Read the quantization config from the HF model config, if available.
     hf_quant_config = getattr(model_config.hf_config, "quantization_config", None)
     # some vision model may keep quantization_config in their text_config

From 78f3b92e1ab90ad63fd4c66ef5a2ec0e841ba319 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Mon, 11 May 2026 00:22:09 +0800
Subject: [PATCH 16/21] clean unnecessary load_general_plugins

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm/engine/arg_utils.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index fc7df1d5ef11..79a0dbdd3707 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1484,8 +1484,6 @@ def from_cli_args(cls, args: argparse.Namespace):
         return engine_args
 
     def create_model_config(self) -> ModelConfig:
-        load_general_plugins()
-
         if not envs.VLLM_ENABLE_V1_MULTIPROCESSING:
             logger.warning(
                 "The global random seed is set to %d. Since "
@@ -1629,7 +1627,6 @@ def create_engine_config(
         NOTE: If VllmConfig is incompatible, we raise an error.
         """
         current_platform.pre_register_and_update()
-        load_general_plugins()
 
         device_config = DeviceConfig(device=cast(Device, current_platform.device_type))
 

From a94499b68f8535b6f72533927bcc64720a8d6e1d Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Thu, 14 May 2026 02:18:46 +0800
Subject: [PATCH 17/21] make pre-commit happy

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm/model_executor/layers/quantization/base_config.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
index 344ddd8abd25..ed32699d2fd9 100644
--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -47,6 +47,13 @@ def embedding(self, layer: torch.nn.Module, *args, **kwargs) -> torch.Tensor:
         Expects create_weights to have been called before on the layer."""
         raise NotImplementedError
 
+    # Not required functions
+    def tie_weights(self, layer: torch.nn.Module, *args, **kwargs):
+        """Tie layer's weights for the layer from another layer/tensors.
+
+        Expects create_weights to have been called before on the layer."""
+        raise NotImplementedError
+
     def process_weights_after_loading(self, layer: nn.Module) -> None:
         """Process the weight after loading.
 

From c321339b11ade7c242ee20ced20e3c509a484e82 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Wed, 20 May 2026 15:58:50 +0800
Subject: [PATCH 18/21] add GGUF doc back

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 docs/features/quantization/gguf.md | 93 ++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 docs/features/quantization/gguf.md

diff --git a/docs/features/quantization/gguf.md b/docs/features/quantization/gguf.md
new file mode 100644
index 000000000000..0aa76d679e15
--- /dev/null
+++ b/docs/features/quantization/gguf.md
@@ -0,0 +1,93 @@
+# GGUF
+
+!!! warning
+    Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, and it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team.
+
+!!! note
+    GGUF support has migrated to the out-of-tree (OOT) [vllm-gguf-plugin](https://github.com/vllm-project/vllm-gguf-plugin). Make sure you have the GGUF plugin installed before serving a GGUF model.
+
+Before serving a GGUF model, make sure to install the [vllm-gguf-plugin](https://github.com/vllm-project/vllm-gguf-plugin):
+
+```bash
+uv pip install vllm-gguf-plugin
+```
+
+To run a GGUF model with vLLM, you can use the `repo_id:quant_type` format to load directly from HuggingFace. For example, to load a Q4_K_M quantized model from [unsloth/Qwen3-0.6B-GGUF](https://huggingface.co/unsloth/Qwen3-0.6B-GGUF):
+
+```bash
+# We recommend using the tokenizer from the base model to avoid slow and error-prone tokenizer conversion.
+vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M --tokenizer Qwen/Qwen3-0.6B
+```
+
+You can also add `--tensor-parallel-size 2` to enable tensor-parallel inference with 2 GPUs:
+
+```bash
+vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M \
+   --tokenizer Qwen/Qwen3-0.6B \
+   --tensor-parallel-size 2
+```
+
+Alternatively, you can download and use a local GGUF file:
+
+```bash
+wget https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf
+vllm serve ./Qwen3-0.6B-Q4_K_M.gguf --tokenizer Qwen/Qwen3-0.6B
+```
+
+!!! warning
+    We recommend using the tokenizer from the base model instead of the GGUF model, because the tokenizer conversion from GGUF is time-consuming and unstable, especially for models with a large vocabulary.
+
+GGUF assumes that HuggingFace can convert the metadata to a config file. If HuggingFace doesn't support your model, you can manually create a config and pass it via `--hf-config-path`:
+
+```bash
+# If your model is not supported by HuggingFace, you can manually provide a HuggingFace-compatible config path
+vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M \
+   --tokenizer Qwen/Qwen3-0.6B \
+   --hf-config-path Qwen/Qwen3-0.6B
+```
+
+You can also use the GGUF model directly through the LLM entrypoint:
+
+??? code
+
+      ```python
+      from vllm import LLM, SamplingParams
+
+      # In this script, we demonstrate how to pass input to the chat method:
+      conversation = [
+         {
+            "role": "system",
+            "content": "You are a helpful assistant",
+         },
+         {
+            "role": "user",
+            "content": "Hello",
+         },
+         {
+            "role": "assistant",
+            "content": "Hello! How can I assist you today?",
+         },
+         {
+            "role": "user",
+            "content": "Write an essay about the importance of higher education.",
+         },
+      ]
+
+      # Create a sampling params object.
+      sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+      # Create an LLM using repo_id:quant_type format.
+      llm = LLM(
+         model="unsloth/Qwen3-0.6B-GGUF:Q4_K_M",
+         tokenizer="Qwen/Qwen3-0.6B",
+      )
+      # Generate texts from the prompts. The output is a list of RequestOutput objects
+      # that contain the prompt, generated text, and other information.
+      outputs = llm.chat(conversation, sampling_params)
+
+      # Print the outputs.
+      for output in outputs:
+         prompt = output.prompt
+         generated_text = output.outputs[0].text
+         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+      ```

From 3467f2593d761305234bd40846a9d78279fd7e1e Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Wed, 20 May 2026 22:22:57 +0800
Subject: [PATCH 19/21] remove gguf kernels again

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 .../quantization/gguf/gguf_kernel.cu          | 560 -------------
 .../libtorch_stable/quantization/gguf/moe.cuh | 739 ------------------
 .../quantization/gguf/moe_vec.cuh             | 338 --------
 3 files changed, 1637 deletions(-)
 delete mode 100644 csrc/libtorch_stable/quantization/gguf/gguf_kernel.cu
 delete mode 100644 csrc/libtorch_stable/quantization/gguf/moe.cuh
 delete mode 100644 csrc/libtorch_stable/quantization/gguf/moe_vec.cuh

diff --git a/csrc/libtorch_stable/quantization/gguf/gguf_kernel.cu b/csrc/libtorch_stable/quantization/gguf/gguf_kernel.cu
deleted file mode 100644
index 0fdfcafab8c0..000000000000
--- a/csrc/libtorch_stable/quantization/gguf/gguf_kernel.cu
+++ /dev/null
@@ -1,560 +0,0 @@
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
-
-#include "../../../cuda_compat.h"
-#include "../../dispatch_utils.h"
-#include "../../torch_utils.h"
-
-#include <torch/csrc/stable/ops.h>
-
-// NOTE: These headers are intentionally kept in csrc/quantization/gguf/ (not
-// moved to libtorch_stable) to avoid unnecessary reformatting that would break
-// git rename detection and pollute blame history.
-#include "../../../quantization/gguf/ggml-common.h"
-#include "../../../quantization/gguf/vecdotq.cuh"
-#include "../../../quantization/gguf/dequantize.cuh"
-#include "../../../quantization/gguf/mmvq.cuh"
-#include "../../../quantization/gguf/mmq.cuh"
-#include "moe.cuh"
-#include "moe_vec.cuh"
-
-// Q8 gemv
-template <typename scalar_t>
-static __global__ void quantize_q8_1(const scalar_t* __restrict__ x,
-                                     void* __restrict__ vy, const int kx,
-                                     const int kx_padded) {
-  const auto ix = blockDim.x * blockIdx.x + threadIdx.x;
-  if (ix >= kx_padded) {
-    return;
-  }
-  const auto iy = blockDim.y * blockIdx.y + threadIdx.y;
-  const int i_padded = iy * kx_padded + ix;
-
-  block_q8_1* y = (block_q8_1*)vy;
-
-  const int ib = i_padded / QK8_1;   // block index
-  const int iqs = i_padded % QK8_1;  // quant index
-
-  const float xi = ix < kx ? static_cast<float>(x[iy * kx + ix]) : 0.0f;
-  float amax = fabsf(xi);
-  float sum = xi;
-
-#pragma unroll
-  for (int mask = 16; mask > 0; mask >>= 1) {
-    amax = fmaxf(amax, VLLM_SHFL_XOR_SYNC_WIDTH(amax, mask, 32));
-    sum += VLLM_SHFL_XOR_SYNC_WIDTH(sum, mask, 32);
-  }
-
-  const float d = amax / 127;
-  const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
-
-  y[ib].qs[iqs] = q;
-
-  if (iqs > 0) {
-    return;
-  }
-
-  y[ib].ds.x = __float2half(d);
-  y[ib].ds.y = __float2half(sum);
-}
-
-template <typename scalar_t>
-static void quantize_row_q8_1_cuda(const scalar_t* x, void* vy, const int kx,
-                                   const int ky, cudaStream_t stream) {
-  const int64_t kx_padded = (kx + 512 - 1) / 512 * 512;
-  const int block_num_x =
-      (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-  constexpr int MAX_BLOCK_SIZE = 65535;
-  for (int off = 0; off < ky; off += MAX_BLOCK_SIZE) {
-    const int num_blocks_y = std::min(ky, off + MAX_BLOCK_SIZE) - off;
-    const dim3 num_blocks(block_num_x, num_blocks_y, 1);
-    const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
-    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(
-        &x[off * kx], (int32_t*)vy + off * (kx_padded / 32 * 9), kx, kx_padded);
-  }
-}
-
-torch::stable::Tensor ggml_dequantize(
-    torch::stable::Tensor W,  // quant weight
-    int64_t type, int64_t m, int64_t n,
-    std::optional<torch::headeronly::ScalarType> const& dtype) {
-  const torch::stable::accelerator::DeviceGuard device_guard(
-      W.get_device_index());
-  auto dtype_ = dtype.value_or(torch::headeronly::ScalarType::Half);
-  auto DW = torch::stable::empty({m, n}, dtype_, std::nullopt, W.device());
-  cudaStream_t stream = get_current_cuda_stream();
-
-  VLLM_STABLE_DISPATCH_FLOATING_TYPES(DW.scalar_type(), "ggml_dequantize", [&] {
-    auto to_cuda = ggml_get_to_cuda<scalar_t>(type);
-    to_cuda((void*)W.data_ptr(), (scalar_t*)DW.data_ptr(), m * n, stream);
-  });
-
-  return DW;
-}
-
-torch::stable::Tensor ggml_mul_mat_vec_a8(
-    torch::stable::Tensor W,  // quant weight
-    torch::stable::Tensor X,  // input
-    int64_t type, int64_t row) {
-  int col = X.sizes()[1];
-  int vecs = X.sizes()[0];
-  const int padded = (col + 512 - 1) / 512 * 512;
-  const torch::stable::accelerator::DeviceGuard device_guard(
-      X.get_device_index());
-  auto Y = torch::stable::empty({vecs, row}, X.scalar_type(), std::nullopt,
-                                W.device());
-  cudaStream_t stream = get_current_cuda_stream();
-  auto quant_X = torch::stable::empty({vecs, padded / 32 * 9},
-                                      torch::headeronly::ScalarType::Int,
-                                      std::nullopt, W.device());
-  VLLM_STABLE_DISPATCH_FLOATING_TYPES(
-      X.scalar_type(), "ggml_mul_mat_vec_a8", [&] {
-        quantize_row_q8_1_cuda<scalar_t>((scalar_t*)X.data_ptr(),
-                                         (void*)quant_X.data_ptr(), col, vecs,
-                                         stream);
-        switch (type) {
-          case 2:
-            mul_mat_vec_q4_0_q8_1_cuda<scalar_t>(
-                (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-                (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-            break;
-          case 3:
-            mul_mat_vec_q4_1_q8_1_cuda<scalar_t>(
-                (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-                (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-            break;
-          case 6:
-            mul_mat_vec_q5_0_q8_1_cuda<scalar_t>(
-                (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-                (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-            break;
-          case 7:
-            mul_mat_vec_q5_1_q8_1_cuda<scalar_t>(
-                (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-                (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-            break;
-          case 8:
-            mul_mat_vec_q8_0_q8_1_cuda<scalar_t>(
-                (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-                (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-            break;
-          case 10:
-            mul_mat_vec_q2_K_q8_1_cuda<scalar_t>(
-                (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-                (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-            break;
-          case 11:
-            mul_mat_vec_q3_K_q8_1_cuda<scalar_t>(
-                (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-                (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-            break;
-          case 12:
-            mul_mat_vec_q4_K_q8_1_cuda<scalar_t>(
-                (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-                (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-            break;
-          case 13:
-            mul_mat_vec_q5_K_q8_1_cuda<scalar_t>(
-                (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-                (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-            break;
-          case 14:
-            mul_mat_vec_q6_K_q8_1_cuda<scalar_t>(
-                (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-                (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-            break;
-          case 16:
-            mul_mat_vec_iq2_xxs_q8_1_cuda<scalar_t>(
-                (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-                (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-            break;
-          case 17:
-            mul_mat_vec_iq2_xs_q8_1_cuda<scalar_t>(
-                (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-                (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-            break;
-          case 18:
-            mul_mat_vec_iq3_xxs_q8_1_cuda<scalar_t>(
-                (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-                (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-            break;
-          case 19:
-            mul_mat_vec_iq1_s_q8_1_cuda<scalar_t>(
-                (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-                (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-            break;
-          case 20:
-            mul_mat_vec_iq4_nl_q8_1_cuda<scalar_t>(
-                (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-                (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-            break;
-          case 21:
-            mul_mat_vec_iq3_s_q8_1_cuda<scalar_t>(
-                (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-                (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-            break;
-          case 22:
-            mul_mat_vec_iq2_s_q8_1_cuda<scalar_t>(
-                (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-                (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-            break;
-          case 23:
-            mul_mat_vec_iq4_xs_q8_1_cuda<scalar_t>(
-                (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-                (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-            break;
-          case 29:
-            mul_mat_vec_iq1_m_q8_1_cuda<scalar_t>(
-                (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-                (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
-            break;
-        }
-      });
-  return Y;
-}
-
-torch::stable::Tensor ggml_mul_mat_a8(torch::stable::Tensor W,  // quant weight
-                                      torch::stable::Tensor X,  // input
-                                      int64_t type, int64_t row) {
-  int col = X.sizes()[1];
-  int padded = (col + 512 - 1) / 512 * 512;
-  int batch = X.sizes()[0];
-  const torch::stable::accelerator::DeviceGuard device_guard(
-      X.get_device_index());
-  auto Y = torch::stable::empty({batch, row}, X.scalar_type(), std::nullopt,
-                                W.device());
-  cudaStream_t stream = get_current_cuda_stream();
-  auto quant_X = torch::stable::empty({batch, padded / 32 * 9},
-                                      torch::headeronly::ScalarType::Int,
-                                      std::nullopt, W.device());
-  VLLM_STABLE_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_mul_mat_a8", [&] {
-    quantize_row_q8_1_cuda((scalar_t*)X.data_ptr(), (void*)quant_X.data_ptr(),
-                           col, batch, stream);
-
-    switch (type) {
-      case 2:
-        ggml_mul_mat_q4_0_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-      case 3:
-        ggml_mul_mat_q4_1_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-      case 6:
-        ggml_mul_mat_q5_0_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-      case 7:
-        ggml_mul_mat_q5_1_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-      case 8:
-        ggml_mul_mat_q8_0_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-      case 10:
-        ggml_mul_mat_q2_K_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-      case 11:
-        ggml_mul_mat_q3_K_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-      case 12:
-        ggml_mul_mat_q4_K_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-      case 13:
-        ggml_mul_mat_q5_K_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-      case 14:
-        ggml_mul_mat_q6_K_q8_1_cuda(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), col, row, batch, padded, row, stream);
-        break;
-    }
-  });
-  return Y;
-}
-
-torch::stable::Tensor ggml_moe_a8(torch::stable::Tensor X,  // input
-                                  torch::stable::Tensor W,  // expert weights
-                                  torch::stable::Tensor sorted_token_ids,
-                                  torch::stable::Tensor expert_ids,
-                                  torch::stable::Tensor num_tokens_post_padded,
-                                  int64_t type, int64_t row, int64_t top_k,
-                                  int64_t tokens) {
-  int col = X.sizes()[1];
-  int padded = (col + 512 - 1) / 512 * 512;
-  const torch::stable::accelerator::DeviceGuard device_guard(
-      X.get_device_index());
-  auto Y = torch::stable::empty({tokens * top_k, row}, X.scalar_type(),
-                                std::nullopt, W.device());
-  cudaStream_t stream = get_current_cuda_stream();
-  auto quant_X = torch::stable::empty({tokens, padded / 32 * 9},
-                                      torch::headeronly::ScalarType::Int,
-                                      std::nullopt, W.device());
-  VLLM_STABLE_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_moe_a8", [&] {
-    quantize_row_q8_1_cuda((scalar_t*)X.data_ptr(), (void*)quant_X.data_ptr(),
-                           col, tokens, stream);
-    switch (type) {
-      case 2:
-        ggml_moe_q4_0_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-      case 3:
-        ggml_moe_q4_1_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-      case 6:
-        ggml_moe_q5_0_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-      case 7:
-        ggml_moe_q5_1_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-      case 8:
-        ggml_moe_q8_0_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-      case 10:
-        ggml_moe_q2_K_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-      case 11:
-        ggml_moe_q3_K_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-      case 12:
-        ggml_moe_q4_K_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-      case 13:
-        ggml_moe_q5_K_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-      case 14:
-        ggml_moe_q6_K_q8_1_cuda(
-            (void*)quant_X.data_ptr(), (void*)W.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)sorted_token_ids.data_ptr(),
-            (int*)expert_ids.data_ptr(),
-            (int*)num_tokens_post_padded.data_ptr(), W.stride(0), col, row,
-            tokens, padded, row, top_k, sorted_token_ids.sizes()[0], stream);
-        break;
-    }
-  });
-  return Y;
-}
-
-torch::stable::Tensor ggml_moe_a8_vec(
-    torch::stable::Tensor X,  // input
-    torch::stable::Tensor W,  // expert weights
-    torch::stable::Tensor topk_ids, int64_t top_k, int64_t type, int64_t row,
-    int64_t tokens) {
-  int col = X.sizes()[1];
-  const int padded = (col + 512 - 1) / 512 * 512;
-  const torch::stable::accelerator::DeviceGuard device_guard(
-      X.get_device_index());
-  auto Y = torch::stable::empty({tokens * top_k, row}, X.scalar_type(),
-                                std::nullopt, W.device());
-  torch::stable::fill_(Y, 0.0);
-  cudaStream_t stream = get_current_cuda_stream();
-  auto quant_X = torch::stable::empty({tokens, padded / 32 * 9},
-                                      torch::headeronly::ScalarType::Int,
-                                      std::nullopt, W.device());
-  VLLM_STABLE_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_moe_vec_a8", [&] {
-    quantize_row_q8_1_cuda<scalar_t>((scalar_t*)X.data_ptr(),
-                                     (void*)quant_X.data_ptr(), col, tokens,
-                                     stream);
-    switch (type) {
-      case 2:
-        moe_vec_q4_0_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 3:
-        moe_vec_q4_1_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 6:
-        moe_vec_q5_0_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 7:
-        moe_vec_q5_1_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 8:
-        moe_vec_q8_0_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 10:
-        moe_vec_q2_K_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 11:
-        moe_vec_q3_K_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 12:
-        moe_vec_q4_K_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 13:
-        moe_vec_q5_K_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 14:
-        moe_vec_q6_K_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 16:
-        moe_vec_iq2_xxs_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 17:
-        moe_vec_iq2_xs_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 18:
-        moe_vec_iq3_xxs_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 19:
-        moe_vec_iq1_s_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 20:
-        moe_vec_iq4_nl_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 21:
-        moe_vec_iq3_s_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 22:
-        moe_vec_iq2_s_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 23:
-        moe_vec_iq4_xs_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-      case 29:
-        moe_vec_iq1_m_q8_1_cuda<scalar_t>(
-            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
-            (scalar_t*)Y.data_ptr(), (int*)topk_ids.data_ptr(), top_k, tokens,
-            col, row, quant_X.stride(0), stream);
-        break;
-    }
-  });
-  return Y;
-}
-
-int64_t ggml_moe_get_block_size(int64_t type) {
-  switch (type) {
-    case 2:
-      return MOE_X_Q4_0;
-    case 3:
-      return MOE_X_Q4_1;
-    case 6:
-      return MOE_X_Q5_0;
-    case 7:
-      return MOE_X_Q5_1;
-    case 8:
-      return MOE_X_Q8_0;
-    case 10:
-      return MOE_X_Q2_K;
-    case 11:
-      return MOE_X_Q3_K;
-    case 12:
-      return MOE_X_Q4_K;
-    case 13:
-      return MOE_X_Q5_K;
-    case 14:
-      return MOE_X_Q6_K;
-  }
-  return 0;
-}
diff --git a/csrc/libtorch_stable/quantization/gguf/moe.cuh b/csrc/libtorch_stable/quantization/gguf/moe.cuh
deleted file mode 100644
index a2f9f46c8f89..000000000000
--- a/csrc/libtorch_stable/quantization/gguf/moe.cuh
+++ /dev/null
@@ -1,739 +0,0 @@
-#include <cstdint>
-
-/* Adapted from ./csrc/quantization/gguf/mmq.cuh
-   based on ./vllm/model_executor/layers/fused_moe/experts/triton_moe.py */
-template <typename scalar_t, int qk, int qr, int qi, bool need_sum,
-          typename block_q_t, int mmq_x, int mmq_y, int nwarps,
-          allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles,
-          int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
-static __device__ __forceinline__ void moe_q(
-    const void* __restrict__ vx, const void* __restrict__ vy,
-    scalar_t* __restrict__ dst, const int* __restrict__ sorted_token_ids,
-    const int* __restrict__ expert_ids,
-    const int* __restrict__ num_tokens_post_padded, const int exp_stride,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y,
-    const int nrows_dst, const int top_k) {
-  const int blocks_per_row_x = ncols_x / qk;
-  const int blocks_per_col_y = nrows_y / QK8_1;
-  const int blocks_per_warp = WARP_SIZE_GGUF / qi;
-
-  const int ncols_dst = ncols_y * top_k;
-
-  const auto row_dst_0 = blockIdx.x * mmq_y;
-  const int& row_x_0 = row_dst_0;
-
-  const auto col_dst_0 = blockIdx.y * mmq_x;
-
-  int token_offs[mmq_x / nwarps];
-  for (int i = 0; i < mmq_x; i += nwarps) {
-    token_offs[i / nwarps] = sorted_token_ids[col_dst_0 + threadIdx.y + i];
-  }
-
-  const int exp_idx = expert_ids[blockIdx.y];
-  if (exp_idx > 255 || exp_idx < 0) return;
-  if (blockIdx.y * mmq_x > num_tokens_post_padded[0]) return;
-
-  const block_q_t* x = (const block_q_t*)((char*)vx + exp_idx * exp_stride);
-  const block_q8_1* y = (const block_q8_1*)(vy);
-
-  int* tile_x_ql = nullptr;
-  half2* tile_x_dm = nullptr;
-  int* tile_x_qh = nullptr;
-  int* tile_x_sc = nullptr;
-
-  allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-
-  __shared__ int tile_y_qs[mmq_x * WARP_SIZE_GGUF];
-  __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE_GGUF / QI8_1];
-
-  float sum[mmq_y / WARP_SIZE_GGUF][mmq_x / nwarps] = {{0.0f}};
-
-  for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
-    load_tiles(x + row_x_0 * blocks_per_row_x + ib0, tile_x_ql, tile_x_dm,
-               tile_x_qh, tile_x_sc, threadIdx.y, nrows_x - row_x_0 - 1,
-               threadIdx.x, blocks_per_row_x);
-
-    const int n_per_r = ((qk * blocks_per_warp) / qr);
-#pragma unroll
-    for (int ir = 0; ir < qr && ib0 * qk + ir * n_per_r < ncols_x; ++ir) {
-      const auto kqs = ir * WARP_SIZE_GGUF + threadIdx.x;
-      const int kbxd = kqs / QI8_1;
-
-#pragma unroll
-      for (int i = 0; i < mmq_x; i += nwarps) {
-        const int col_y_eff = token_offs[i / nwarps] / top_k;
-        const int block_x = ib0 * (qk / QK8_1) + kbxd;
-        if (col_y_eff < ncols_y && block_x < blocks_per_col_y) {
-          const block_q8_1* by0 = &y[col_y_eff * blocks_per_col_y + block_x];
-          const int index_y =
-              (threadIdx.y + i) * WARP_SIZE_GGUF + kqs % WARP_SIZE_GGUF;
-          tile_y_qs[index_y] =
-              get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
-        }
-      }
-
-      if (threadIdx.x < n_per_r / QK8_1) {
-        const auto kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1);
-        const int col_y_eff = token_offs[threadIdx.y] / top_k;
-        const int block_x =
-            ib0 * (qk / QK8_1) + ir * (WARP_SIZE_GGUF / QI8_1) + kby;
-
-        if (col_y_eff < ncols_y && block_x < blocks_per_col_y) {
-          const half2* dsi_src = &y[col_y_eff * blocks_per_col_y + block_x].ds;
-          half2* dsi_dst =
-              &tile_y_ds[threadIdx.y * (WARP_SIZE_GGUF / QI8_1) + kby];
-
-          if (need_sum) {
-            *dsi_dst = *dsi_src;
-          } else {
-            float* dfi_dst = (float*)dsi_dst;
-            *dfi_dst = __low2float(*dsi_src);
-          }
-        }
-      }
-      __syncthreads();
-
-      // #pragma unroll // unrolling this loop causes too much register pressure
-      for (int k = ir * WARP_SIZE_GGUF / qr; k < (ir + 1) * WARP_SIZE_GGUF / qr;
-           k += vdr) {
-#pragma unroll
-        for (int j = 0; j < mmq_x; j += nwarps) {
-#pragma unroll
-          for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
-            sum[i / WARP_SIZE_GGUF][j / nwarps] +=
-                vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs,
-                        tile_y_ds, threadIdx.x + i, threadIdx.y + j, k);
-          }
-        }
-      }
-      __syncthreads();
-    }
-  }
-
-#pragma unroll
-  for (int j = 0; j < mmq_x; j += nwarps) {
-    const int col_dst = token_offs[j / nwarps];
-    if (col_dst >= ncols_dst) {
-      return;
-    }
-
-#pragma unroll
-    for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
-      const auto row_dst = row_dst_0 + threadIdx.x + i;
-      if (row_dst >= nrows_dst) {
-        continue;
-      }
-      dst[col_dst * nrows_dst + row_dst] = sum[i / WARP_SIZE_GGUF][j / nwarps];
-    }
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q4_0 8
-  #define MOE_Y_Q4_0 128
-  #define NWARPS_Q4_0 8
-#else
-  #define MOE_X_Q4_0 4
-  #define MOE_Y_Q4_0 32
-  #define NWARPS_Q4_0 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_0, 2)
-#endif
-    moe_q4_0(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-  const int mmq_x = MOE_X_Q4_0;
-  const int mmq_y = MOE_Y_Q4_0;
-  const int nwarps = NWARPS_Q4_0;
-
-  moe_q<scalar_t, QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q4_0<mmq_y>, load_tiles_q4_0<mmq_y, nwarps, need_check>,
-        VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-
-template <typename scalar_t>
-static void ggml_moe_q4_0_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  int mmq_x = MOE_X_Q4_0;
-  int mmq_y = MOE_Y_Q4_0;
-  int nwarps = NWARPS_Q4_0;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q4_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q4_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q4_1 8
-  #define MOE_Y_Q4_1 128
-  #define NWARPS_Q4_1 8
-#else
-  #define MOE_X_Q4_1 4
-  #define MOE_Y_Q4_1 32
-  #define NWARPS_Q4_1 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_1, 2)
-#endif
-    moe_q4_1(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-  const int mmq_x = MOE_X_Q4_1;
-  const int mmq_y = MOE_Y_Q4_1;
-  const int nwarps = NWARPS_Q4_1;
-
-  moe_q<scalar_t, QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q4_1<mmq_y>, load_tiles_q4_1<mmq_y, nwarps, need_check>,
-        VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-
-template <typename scalar_t>
-static void ggml_moe_q4_1_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  int mmq_x = MOE_X_Q4_1;
-  int mmq_y = MOE_Y_Q4_1;
-  int nwarps = NWARPS_Q4_1;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q4_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q4_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q5_0 8
-  #define MOE_Y_Q5_0 128
-  #define NWARPS_Q5_0 8
-#else
-  #define MOE_X_Q5_0 4
-  #define MOE_Y_Q5_0 32
-  #define NWARPS_Q5_0 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_0, 2)
-#endif
-    moe_q5_0(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-  const int mmq_x = MOE_X_Q5_0;
-  const int mmq_y = MOE_Y_Q5_0;
-  const int nwarps = NWARPS_Q5_0;
-
-  moe_q<scalar_t, QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q5_0<mmq_y>, load_tiles_q5_0<mmq_y, nwarps, need_check>,
-        VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-
-template <typename scalar_t>
-static void ggml_moe_q5_0_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MOE_X_Q5_0;
-  const int mmq_y = MOE_Y_Q5_0;
-  const int nwarps = NWARPS_Q5_0;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q5_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q5_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q5_1 8
-  #define MOE_Y_Q5_1 128
-  #define NWARPS_Q5_1 8
-#else
-  #define MOE_X_Q5_1 4
-  #define MOE_Y_Q5_1 32
-  #define NWARPS_Q5_1 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_1, 2)
-#endif
-    moe_q5_1(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-  const int mmq_x = MOE_X_Q5_1;
-  const int mmq_y = MOE_Y_Q5_1;
-  const int nwarps = NWARPS_Q5_1;
-
-  moe_q<scalar_t, QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q5_1<mmq_y>, load_tiles_q5_1<mmq_y, nwarps, need_check>,
-        VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-
-template <typename scalar_t>
-static void ggml_moe_q5_1_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MOE_X_Q5_1;
-  const int mmq_y = MOE_Y_Q5_1;
-  const int nwarps = NWARPS_Q5_1;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q5_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q5_1<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q8_0 8
-  #define MOE_Y_Q8_0 128
-  #define NWARPS_Q8_0 8
-#else
-  #define MOE_X_Q8_0 4
-  #define MOE_Y_Q8_0 32
-  #define NWARPS_Q8_0 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q8_0, 2)
-#endif
-    moe_q8_0(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-  const int mmq_x = MOE_X_Q8_0;
-  const int mmq_y = MOE_Y_Q8_0;
-  const int nwarps = NWARPS_Q8_0;
-
-  moe_q<scalar_t, QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q8_0<mmq_y>, load_tiles_q8_0<mmq_y, nwarps, need_check>,
-        VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-
-template <typename scalar_t>
-static void ggml_moe_q8_0_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MOE_X_Q8_0;
-  const int mmq_y = MOE_Y_Q8_0;
-  const int nwarps = NWARPS_Q8_0;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q8_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q8_0<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q2_K 8
-  #define MOE_Y_Q2_K 128
-  #define NWARPS_Q2_K 8
-#else
-  #define MOE_X_Q2_K 4
-  #define MOE_Y_Q2_K 32
-  #define NWARPS_Q2_K 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q2_K, 2)
-#endif
-    moe_q2_K(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-  const int mmq_x = MOE_X_Q2_K;
-  const int mmq_y = MOE_Y_Q2_K;
-  const int nwarps = NWARPS_Q2_K;
-
-  moe_q<scalar_t, QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q2_K<mmq_y>, load_tiles_q2_K<mmq_y, nwarps, need_check>,
-        VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-
-template <typename scalar_t>
-static void ggml_moe_q2_K_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MOE_X_Q2_K;
-  const int mmq_y = MOE_Y_Q2_K;
-  const int nwarps = NWARPS_Q2_K;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q2_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q2_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q3_K 8
-  #define MOE_Y_Q3_K 128
-  #define NWARPS_Q3_K 8
-#else
-  #define MOE_X_Q3_K 4
-  #define MOE_Y_Q3_K 32
-  #define NWARPS_Q3_K 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q3_K, 2)
-#endif
-    moe_q3_K(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-
-  const int mmq_x = MOE_X_Q3_K;
-  const int mmq_y = MOE_Y_Q3_K;
-  const int nwarps = NWARPS_Q3_K;
-
-  moe_q<scalar_t, QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q3_K<mmq_y>, load_tiles_q3_K<mmq_y, nwarps, need_check>,
-        VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-template <typename scalar_t>
-static void ggml_moe_q3_K_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MOE_X_Q3_K;
-  const int mmq_y = MOE_Y_Q3_K;
-  const int nwarps = NWARPS_Q3_K;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q3_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q3_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q4_K 8
-  #define MOE_Y_Q4_K 128
-  #define NWARPS_Q4_K 8
-#else
-  #define MOE_X_Q4_K 4
-  #define MOE_Y_Q4_K 32
-  #define NWARPS_Q4_K 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_K, 2)
-#endif
-    moe_q4_K(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-  const int mmq_x = MOE_X_Q4_K;
-  const int mmq_y = MOE_Y_Q4_K;
-  const int nwarps = NWARPS_Q4_K;
-
-  moe_q<scalar_t, QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q4_K<mmq_y>, load_tiles_q4_K<mmq_y, nwarps, need_check>,
-        VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-
-template <typename scalar_t>
-static void ggml_moe_q4_K_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MOE_X_Q4_K;
-  const int mmq_y = MOE_Y_Q4_K;
-  const int nwarps = NWARPS_Q4_K;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q4_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q4_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q5_K 8
-  #define MOE_Y_Q5_K 128
-  #define NWARPS_Q5_K 8
-#else
-  #define MOE_X_Q5_K 4
-  #define MOE_Y_Q5_K 32
-  #define NWARPS_Q5_K 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q5_K, 2)
-#endif
-    moe_q5_K(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-  const int mmq_x = MOE_X_Q5_K;
-  const int mmq_y = MOE_Y_Q5_K;
-  const int nwarps = NWARPS_Q5_K;
-
-  moe_q<scalar_t, QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q5_K<mmq_y>, load_tiles_q5_K<mmq_y, nwarps, need_check>,
-        VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-
-template <typename scalar_t>
-static void ggml_moe_q5_K_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MOE_X_Q5_K;
-  const int mmq_y = MOE_Y_Q5_K;
-  const int nwarps = NWARPS_Q5_K;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q5_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q5_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
-
-#if defined(USE_ROCM)
-  #define MOE_X_Q6_K 8
-  #define MOE_Y_Q6_K 128
-  #define NWARPS_Q6_K 8
-#else
-  #define MOE_X_Q6_K 4
-  #define MOE_Y_Q6_K 32
-  #define NWARPS_Q6_K 4
-#endif
-
-template <typename scalar_t, bool need_check>
-static __global__ void
-#if defined(USE_ROCM)
-__launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q6_K, 2)
-#endif
-    moe_q6_K(const void* __restrict__ vx, const void* __restrict__ vy,
-             scalar_t* __restrict__ dst, const int* sorted_token_ids,
-             const int* expert_ids, const int* num_tokens_post_padded,
-             const int exp_stride, const int ncols_x, const int nrows_x,
-             const int ncols_y, const int nrows_y, const int nrows_dst,
-             const int top_k) {
-  const int mmq_x = MOE_X_Q6_K;
-  const int mmq_y = MOE_Y_Q6_K;
-  const int nwarps = NWARPS_Q6_K;
-
-  moe_q<scalar_t, QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
-        allocate_tiles_q6_K<mmq_y>, load_tiles_q6_K<mmq_y, nwarps, need_check>,
-        VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>(
-      vx, vy, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-      exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-}
-
-template <typename scalar_t>
-static void ggml_moe_q6_K_q8_1_cuda(
-    const void* inp, const void* w, scalar_t* dst, const int* sorted_token_ids,
-    const int* expert_ids, const int* num_tokens_post_padded,
-    const int exp_stride, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k,
-    const int tokens_post_padded, cudaStream_t stream) {
-  const int mmq_x = MOE_X_Q6_K;
-  const int mmq_y = MOE_Y_Q6_K;
-  const int nwarps = NWARPS_Q6_K;
-
-  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-  const int block_num_y = (tokens_post_padded) / mmq_x;
-  const dim3 block_nums(block_num_x, block_num_y, 1);
-  const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1);
-
-  if (nrows_x % mmq_y == 0) {
-    constexpr bool need_check = false;
-    moe_q6_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  } else {
-    constexpr bool need_check = true;
-    moe_q6_K<scalar_t, need_check><<<block_nums, block_dims, 0, stream>>>(
-        w, inp, dst, sorted_token_ids, expert_ids, num_tokens_post_padded,
-        exp_stride, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, top_k);
-  }
-}
diff --git a/csrc/libtorch_stable/quantization/gguf/moe_vec.cuh b/csrc/libtorch_stable/quantization/gguf/moe_vec.cuh
deleted file mode 100644
index 60f65a1bfdcb..000000000000
--- a/csrc/libtorch_stable/quantization/gguf/moe_vec.cuh
+++ /dev/null
@@ -1,338 +0,0 @@
-// copied and adapted from
-// https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmvq.cu
-template <typename scalar_t, int qk, int qi, typename block_q_t, int vdr,
-          vec_dot_q_cuda_t vec_dot_q_cuda>
-static __global__ void moe_vec_q(const void* __restrict__ vx,
-                                 const void* __restrict__ vy,
-                                 scalar_t* __restrict__ dst,
-                                 const int* topk_ids, const int topk,
-                                 const int ncols, const int nrows,
-                                 const int token_stride) {
-  const auto row = blockIdx.x * blockDim.y + threadIdx.y;
-
-  const auto token = blockIdx.z / topk;
-  const auto expert = (topk_ids)[blockIdx.z];
-
-  if (row >= nrows) {
-    return;
-  }
-
-  const int blocks_per_row = ncols / qk;
-  const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
-  // partial sum for each thread
-  float tmp = 0.0f;
-
-  const block_q_t* x = ((const block_q_t*)vx) + expert * nrows * blocks_per_row;
-  const block_q8_1* y =
-      (const block_q8_1*)(((const int*)vy) + token * token_stride);
-
-  for (auto i = threadIdx.x / (qi / vdr); i < blocks_per_row;
-       i += blocks_per_warp) {
-    const int ibx = row * blocks_per_row + i;  // x block index
-
-    const int iby = i * (qk / QK8_1);  // y block index that aligns with ibx
-
-    const int iqs =
-        vdr *
-        (threadIdx.x %
-         (qi / vdr));  // x block quant index when casting the quants to int
-
-    tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
-  }
-
-  // sum up partial sums and write back result
-#pragma unroll
-  for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
-    tmp += VLLM_SHFL_XOR_SYNC(tmp, mask);
-  }
-
-  if (threadIdx.x == 0) {
-    dst[blockIdx.z * nrows + row] = tmp;
-  }
-}
-
-template <typename scalar_t>
-static void moe_vec_q4_0_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
-            vec_dot_q4_0_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_q4_1_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
-            vec_dot_q4_1_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_q5_0_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
-            vec_dot_q5_0_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_q5_1_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
-            vec_dot_q5_1_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_q8_0_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
-            vec_dot_q8_0_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_q2_K_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
-            vec_dot_q2_K_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_q3_K_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
-            vec_dot_q3_K_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_q4_K_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
-            vec_dot_q4_K_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_q5_K_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
-            vec_dot_q5_K_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_q6_K_q8_1_cuda(const void* vx, const void* vy,
-                                   scalar_t* dst, const int* topk_ids,
-                                   const int top_k, const int tokens,
-                                   const int ncols, const int nrows,
-                                   const int token_stride,
-                                   cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
-            vec_dot_q6_K_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_iq2_xxs_q8_1_cuda(const void* vx, const void* vy,
-                                      scalar_t* dst, const int* topk_ids,
-                                      const int top_k, const int tokens,
-                                      const int ncols, const int nrows,
-                                      const int token_stride,
-                                      cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
-      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
-                                              ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_iq2_xs_q8_1_cuda(const void* vx, const void* vy,
-                                     scalar_t* dst, const int* topk_ids,
-                                     const int top_k, const int tokens,
-                                     const int ncols, const int nrows,
-                                     const int token_stride,
-                                     cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
-      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
-                                              ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_iq2_s_q8_1_cuda(const void* vx, const void* vy,
-                                    scalar_t* dst, const int* topk_ids,
-                                    const int top_k, const int tokens,
-                                    const int ncols, const int nrows,
-                                    const int token_stride,
-                                    cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
-      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
-                                              ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_iq3_xxs_q8_1_cuda(const void* vx, const void* vy,
-                                      scalar_t* dst, const int* topk_ids,
-                                      const int top_k, const int tokens,
-                                      const int ncols, const int nrows,
-                                      const int token_stride,
-                                      cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
-      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
-                                              ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_iq1_s_q8_1_cuda(const void* vx, const void* vy,
-                                    scalar_t* dst, const int* topk_ids,
-                                    const int top_k, const int tokens,
-                                    const int ncols, const int nrows,
-                                    const int token_stride,
-                                    cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
-      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
-                                              ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_iq1_m_q8_1_cuda(const void* vx, const void* vy,
-                                    scalar_t* dst, const int* topk_ids,
-                                    const int top_k, const int tokens,
-                                    const int ncols, const int nrows,
-                                    const int token_stride,
-                                    cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI1_M, block_iq1_m, 1, vec_dot_iq1_m_q8_1>
-      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
-                                              ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_iq4_nl_q8_1_cuda(const void* vx, const void* vy,
-                                     scalar_t* dst, const int* topk_ids,
-                                     const int top_k, const int tokens,
-                                     const int ncols, const int nrows,
-                                     const int token_stride,
-                                     cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ,
-            vec_dot_iq4_nl_q8_1><<<block_nums, block_dims, 0, stream>>>(
-      vx, vy, dst, topk_ids, top_k, ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_iq4_xs_q8_1_cuda(const void* vx, const void* vy,
-                                     scalar_t* dst, const int* topk_ids,
-                                     const int top_k, const int tokens,
-                                     const int ncols, const int nrows,
-                                     const int token_stride,
-                                     cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
-      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
-                                              ncols, nrows, token_stride);
-}
-
-template <typename scalar_t>
-static void moe_vec_iq3_s_q8_1_cuda(const void* vx, const void* vy,
-                                    scalar_t* dst, const int* topk_ids,
-                                    const int top_k, const int tokens,
-                                    const int ncols, const int nrows,
-                                    const int token_stride,
-                                    cudaStream_t stream) {
-  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-  const dim3 block_nums(block_num_y, 1, tokens * top_k);
-  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-  moe_vec_q<scalar_t, QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
-      <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, topk_ids, top_k,
-                                              ncols, nrows, token_stride);
-}

From ae0324c64b94bb723a628c2af242ad04ccc6adee Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Wed, 20 May 2026 22:25:57 +0800
Subject: [PATCH 20/21] remove ggml bindings

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 csrc/libtorch_stable/ops.h              | 29 -------------------
 csrc/libtorch_stable/torch_bindings.cpp | 38 -------------------------
 2 files changed, 67 deletions(-)

diff --git a/csrc/libtorch_stable/ops.h b/csrc/libtorch_stable/ops.h
index 5ebcb2034f53..006a5af3dd81 100644
--- a/csrc/libtorch_stable/ops.h
+++ b/csrc/libtorch_stable/ops.h
@@ -220,32 +220,3 @@ torch::stable::Tensor gptq_gemm(torch::stable::Tensor a,
 
 void gptq_shuffle(torch::stable::Tensor q_weight, torch::stable::Tensor q_perm,
                   int64_t bit);
-
-// GGML kernels (shared CUDA/ROCm)
-torch::stable::Tensor ggml_dequantize(
-    torch::stable::Tensor W, int64_t type, int64_t m, int64_t n,
-    std::optional<torch::headeronly::ScalarType> const& dtype);
-
-torch::stable::Tensor ggml_mul_mat_vec_a8(torch::stable::Tensor W,
-                                          torch::stable::Tensor X, int64_t type,
-                                          int64_t row);
-
-torch::stable::Tensor ggml_mul_mat_a8(torch::stable::Tensor W,
-                                      torch::stable::Tensor X, int64_t type,
-                                      int64_t row);
-
-torch::stable::Tensor ggml_moe_a8(torch::stable::Tensor X,
-                                  torch::stable::Tensor W,
-                                  torch::stable::Tensor sorted_token_ids,
-                                  torch::stable::Tensor expert_ids,
-                                  torch::stable::Tensor num_tokens_post_padded,
-                                  int64_t type, int64_t row, int64_t top_k,
-                                  int64_t tokens);
-
-torch::stable::Tensor ggml_moe_a8_vec(torch::stable::Tensor X,
-                                      torch::stable::Tensor W,
-                                      torch::stable::Tensor topk_ids,
-                                      int64_t top_k, int64_t type, int64_t row,
-                                      int64_t tokens);
-
-int64_t ggml_moe_get_block_size(int64_t type);
diff --git a/csrc/libtorch_stable/torch_bindings.cpp b/csrc/libtorch_stable/torch_bindings.cpp
index ee0af3da560c..2fd2060dcebc 100644
--- a/csrc/libtorch_stable/torch_bindings.cpp
+++ b/csrc/libtorch_stable/torch_bindings.cpp
@@ -341,34 +341,6 @@ STABLE_TORCH_LIBRARY_FRAGMENT(_C, ops) {
 
   // Post processing for GPTQ.
   ops.def("gptq_shuffle(Tensor! q_weight, Tensor q_perm, int bit) -> ()");
-
-  // Dequantization for GGML.
-  ops.def(
-      "ggml_dequantize(Tensor W, int type, SymInt m, SymInt n, ScalarType? "
-      "dtype) -> Tensor");
-
-  // mmvq kernel for GGML.
-  ops.def(
-      "ggml_mul_mat_vec_a8(Tensor W, Tensor X, int type, SymInt row) "
-      "-> Tensor");
-
-  // mmq kernel for GGML.
-  ops.def(
-      "ggml_mul_mat_a8(Tensor W, Tensor X, int type, SymInt row) -> Tensor");
-
-  // moe kernel for GGML.
-  ops.def(
-      "ggml_moe_a8(Tensor X, Tensor W, "
-      "Tensor sorted_token_ids, Tensor expert_ids, Tensor "
-      "num_tokens_post_padded, "
-      "int type, SymInt row, SymInt top_k, SymInt tokens) -> Tensor");
-
-  ops.def(
-      "ggml_moe_a8_vec(Tensor X, Tensor W, "
-      "Tensor topk_ids, int top_k, "
-      "int type, SymInt row, SymInt tokens) -> Tensor");
-
-  ops.def("ggml_moe_get_block_size(int type) -> int");
 }
 
 STABLE_TORCH_LIBRARY_IMPL(_C, CUDA, ops) {
@@ -441,13 +413,6 @@ STABLE_TORCH_LIBRARY_IMPL(_C, CUDA, ops) {
   // GPTQ kernels
   ops.impl("gptq_gemm", TORCH_BOX(&gptq_gemm));
   ops.impl("gptq_shuffle", TORCH_BOX(&gptq_shuffle));
-
-  // GGML kernels
-  ops.impl("ggml_dequantize", TORCH_BOX(&ggml_dequantize));
-  ops.impl("ggml_mul_mat_vec_a8", TORCH_BOX(&ggml_mul_mat_vec_a8));
-  ops.impl("ggml_mul_mat_a8", TORCH_BOX(&ggml_mul_mat_a8));
-  ops.impl("ggml_moe_a8", TORCH_BOX(&ggml_moe_a8));
-  ops.impl("ggml_moe_a8_vec", TORCH_BOX(&ggml_moe_a8_vec));
 }
 
 // These capability-check functions take only primitive args (no tensors), so
@@ -465,9 +430,6 @@ STABLE_TORCH_LIBRARY_IMPL(_C, CompositeExplicitAutograd, ops) {
   ops.impl("cutlass_scaled_mm_supports_fp4",
            TORCH_BOX(&cutlass_scaled_mm_supports_fp4));
 #endif
-
-  // GGML block size lookup (no tensor args)
-  ops.impl("ggml_moe_get_block_size", TORCH_BOX(&ggml_moe_get_block_size));
 }
 
 REGISTER_EXTENSION(_C_stable_libtorch)

From c33cf711a00049ff7cad29a8117234f2aec6cb47 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Wed, 20 May 2026 23:01:21 +0800
Subject: [PATCH 21/21] fix build

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 CMakeLists.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d5039470d41b..a4c9a7fd3e36 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -632,8 +632,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
     "csrc/libtorch_stable/activation_kernels.cu"
     "csrc/libtorch_stable/quantization/w8a8/int8/scaled_quant.cu"
     "csrc/libtorch_stable/quantization/w8a8/fp8/common.cu"
-    "csrc/libtorch_stable/quantization/gptq/q_gemm.cu"
-    "csrc/libtorch_stable/quantization/gguf/gguf_kernel.cu")
+    "csrc/libtorch_stable/quantization/gptq/q_gemm.cu")
 
   if(VLLM_GPU_LANG STREQUAL "CUDA")
     list(APPEND VLLM_STABLE_EXT_SRC