Commit e976127

Merge pull request #33 from zoq/vulkan_tq2_0_type
Integrate TQ2_0 into Vulkan

2 parents 9d22adc + 6d0777e
File tree: 13 files changed, +391 −13 lines
convert_hf_to_gguf.py

Lines changed: 60 additions & 3 deletions
@@ -2641,18 +2641,47 @@ def prepare_tensors(self):
         super().prepare_tensors()


-@ModelBase.register("BitnetForCausalLM")
+@ModelBase.register("BitnetForCausalLM", "BitNetForCausalLM")
 class BitnetModel(TextModel):
     model_arch = gguf.MODEL_ARCH.BITNET

+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._bitnet_weight_scales: dict[str, torch.Tensor] = {}
+
     def set_vocab(self):
-        self._set_vocab_sentencepiece()
+        if (self.dir_model / "tokenizer.model").is_file():
+            self._set_vocab_sentencepiece()
+        else:
+            self._set_vocab_gpt2()

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)

+    @staticmethod
+    def _unpack_bitnet_weights(packed: torch.Tensor) -> torch.Tensor:
+        if packed.dtype != torch.uint8:
+            raise ValueError(f"Expected packed BitNet weights to be torch.uint8, got {packed.dtype}")
+
+        values_per_item = 4
+        rows = packed.shape[0]
+        rest = packed.shape[1:]
+
+        unpacked_chunks: list[torch.Tensor] = []
+        mapping = torch.tensor([-1.0, 0.0, 1.0, 0.0], dtype=torch.float32, device=packed.device)
+
+        for i in range(values_per_item):
+            chunk = (packed >> (2 * i)) & 0x03
+            chunk = mapping[chunk.long()].reshape((rows, *rest))
+            unpacked_chunks.append(chunk)
+
+        if not unpacked_chunks:
+            raise ValueError("Failed to unpack BitNet weights: no chunks produced")
+
+        return torch.cat(unpacked_chunks, dim=0)
+
     def weight_quant(self, weight: Tensor) -> Tensor:
         dtype = weight.dtype
         weight = weight.float()
@@ -2665,8 +2694,36 @@ def weight_quant(self, weight: Tensor) -> Tensor:
         return result.type(dtype)

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.endswith(".weight_scale"):
+            weight_name = name[:-13] + ".weight"
+            mapped_weight_name = self.map_tensor_name(weight_name)
+            if isinstance(data_torch, LazyTorchTensor):
+                data_torch = LazyTorchTensor.to_eager(data_torch)
+
+            scale_tensor = data_torch.to(torch.float32)
+            self._bitnet_weight_scales[mapped_weight_name] = scale_tensor
+            return []
+
         new_name = self.map_tensor_name(name)

+        ternary_weight = False
+
+        if name.endswith(".weight"):
+            scale_tensor = self._bitnet_weight_scales.pop(new_name, None)
+            if scale_tensor is not None:
+                scale_tensor = scale_tensor.to(torch.float32)
+                if scale_tensor.numel() != 1:
+                    raise ValueError(f"Expected scalar weight_scale for '{name}', got shape {tuple(scale_tensor.shape)}")
+
+                if isinstance(data_torch, LazyTorchTensor):
+                    data_torch = LazyTorchTensor.to_eager(data_torch)
+
+                packed = data_torch.to(torch.uint8)
+                unpacked = self._unpack_bitnet_weights(packed)
+                scale_value = scale_tensor.reshape(-1)[0].item()
+                data_torch = unpacked * scale_value
+                ternary_weight = True
+
         if any(self.match_model_tensor_name(new_name, key, bid) for key in [
             gguf.MODEL_TENSOR.ATTN_Q,
             gguf.MODEL_TENSOR.ATTN_K,
@@ -2675,7 +2732,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
             gguf.MODEL_TENSOR.FFN_UP,
             gguf.MODEL_TENSOR.FFN_DOWN,
             gguf.MODEL_TENSOR.FFN_GATE,
-        ]):
+        ]) and not ternary_weight:
             # transform weight into 1/0/-1 (in fp32)
             data_torch = self.weight_quant(data_torch)
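
For context, here is a minimal round-trip sketch in Python (not part of the PR) of the 2-bit packing that _unpack_bitnet_weights reverses. It assumes the layout implied by the converter's torch.cat along dim 0: each 2-bit field of a packed uint8 holds one quarter of the output rows, with codes 0 -> -1, 1 -> 0, 2 -> +1 (code 3 is unused and decodes to 0); pack_ternary is a hypothetical helper for illustration, not something the checkpoint format is guaranteed to use.

import torch

def pack_ternary(w: torch.Tensor) -> torch.Tensor:
    # w: float tensor of {-1, 0, +1} values whose first dimension is divisible by 4
    codes = (w + 1).to(torch.uint8)                      # -1/0/+1 -> 0/1/2
    rows = w.shape[0] // 4
    packed = torch.zeros((rows, *w.shape[1:]), dtype=torch.uint8)
    for i in range(4):
        packed |= codes[i * rows:(i + 1) * rows] << (2 * i)   # quarter i goes into bit field i
    return packed

def unpack_ternary(packed: torch.Tensor) -> torch.Tensor:
    # mirrors BitnetModel._unpack_bitnet_weights above
    mapping = torch.tensor([-1.0, 0.0, 1.0, 0.0])
    chunks = [mapping[((packed >> (2 * i)) & 0x03).long()] for i in range(4)]
    return torch.cat(chunks, dim=0)

w = torch.randint(-1, 2, (8, 16)).float()                # toy ternary weight matrix
assert torch.equal(unpack_ternary(pack_ternary(w)), w)

The converter then multiplies the unpacked {-1, 0, +1} tensor by the scalar weight_scale and, via the ternary_weight flag, skips the usual weight_quant pass for those tensors.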

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 71 additions & 2 deletions
Large diffs are not rendered by default.

ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp

Lines changed: 25 additions & 1 deletion
@@ -434,6 +434,30 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
 }
 #endif

+#if defined(DATA_A_TQ2_0)
+// TQ2_0 ternary dequantization: {0,1,2} -> {-1,0,+1} via (q-1) mapping
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
+    const uint c0 = (vui >> 0) & 3;
+    const uint c1 = (vui >> 2) & 3;
+    const float q0 = float(c0) - 1.0f;
+    const float q1 = float(c1) - 1.0f;
+    return vec2(q0, q1);
+}
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
+    const uint c0 = (vui >> 0) & 3;
+    const uint c1 = (vui >> 2) & 3;
+    const uint c2 = (vui >> 4) & 3;
+    const uint c3 = (vui >> 6) & 3;
+    const float q0 = float(c0) - 1.0f;
+    const float q1 = float(c1) - 1.0f;
+    const float q2 = float(c2) - 1.0f;
+    const float q3 = float(c3) - 1.0f;
+    return vec4(q0, q1, q2, q3);
+}
+#endif
+
 #if defined(DATA_A_MXFP4)
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
     const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
@@ -461,7 +485,7 @@ vec2 get_dm(uint ib, uint a_offset) {
 }
 #endif

-#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ1_S) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
+#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_TQ2_0) || defined(DATA_A_IQ1_S) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
 vec2 get_dm(uint ib, uint a_offset) {
     return vec2(float(data_a[a_offset + ib].d), 0);
 }
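
As a worked example of the mapping above: a packed byte 0x99 (binary 10011001) carries the 2-bit codes 1, 2, 1, 2 at shifts 0, 2, 4 and 6, so dequantize4 returns (0, +1, 0, +1), which are then scaled by the block scale d that get_dm returns.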

ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp

Lines changed: 20 additions & 0 deletions
@@ -654,6 +654,24 @@ float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoor
 }
 #endif

+#if defined(DATA_A_TQ2_0)
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufTQ2_0 {
+    block_tq2_0 block;
+};
+
+float16_t dequantFuncTQ2_0(const in decodeBufTQ2_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+
+    const uint byte_idx = ((idx >> 7) << 5) + (idx & 31u);
+    const uint qsshift = (((idx & 127u) >> 5) << 1);
+
+    const uint c = (uint(bl.block.qs[byte_idx]) >> qsshift) & 3u;
+    return d * float16_t(float(c) - 1.0f);
+}
+#endif
+
 #if defined(DATA_A_MXFP4)
 layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufMXFP4 {
     block_mxfp4 block;
@@ -715,6 +733,8 @@ float16_t dequantFuncMXFP4(const in decodeBufMXFP4 bl, const in uint blockCoords
 #define dequantFuncA dequantFuncIQ4_XS
 #elif defined(DATA_A_IQ4_NL)
 #define dequantFuncA dequantFuncIQ4_NL
+#elif defined(DATA_A_TQ2_0)
+#define dequantFuncA dequantFuncTQ2_0
 #elif defined(DATA_A_MXFP4)
 #define dequantFuncA dequantFuncMXFP4
 #endif
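
For reference, a plain-Python sketch (not part of the PR) of the per-block decode that dequantFuncTQ2_0 implements, and that matches the CPU access pattern noted in the matrix-vector shader further below: element idx selects byte ((idx >> 7) << 5) + (idx & 31) and shift ((idx & 127) >> 5) << 1, and the 2-bit code q maps to (q - 1) * d. The function name is illustrative only.

def dequant_tq2_0_block(qs, d):
    # qs: the 64 packed bytes of one block_tq2_0; d: its float16 scale as a Python float
    out = []
    for idx in range(256):                         # QUANT_K_TQ2_0 values per block
        byte_idx = ((idx >> 7) << 5) + (idx & 31)  # same byte indexing as dequantFuncTQ2_0
        shift = ((idx & 127) >> 5) << 1            # 0, 2, 4 or 6
        q = (qs[byte_idx] >> shift) & 3            # 2-bit code in {0, 1, 2}
        out.append(d * (q - 1))                    # ternary value in {-d, 0, +d}
    return out
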
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+#version 450
+
+#extension GL_EXT_shader_16bit_storage : require
+
+#include "types.comp"
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+layout (push_constant) uniform parameter {
+    uint ne;
+} p;
+
+layout (local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    const uint i = gl_GlobalInvocationID.x * 4;
+
+    if (i >= p.ne) {
+        return;
+    }
+
+    const uint ib = i / QUANT_K;            // block index
+    const uint iqs = (i % QUANT_K) / 4;     // quant index within block (byte index)
+    const uint bit_pos_base = (i % 4) * 2;  // bit position within byte
+
+    const float d = float(data_a[ib].d);
+
+    for (uint j = 0; j < 4 && (i + j) < p.ne; ++j) {
+        const uint local_iqs = ((i + j) % QUANT_K) / 4;  // byte index for this element
+        const uint bit_pos = ((i + j) % 4) * 2;          // bit position for this element
+        const uint vui = uint(data_a[ib].qs[local_iqs]);
+        const uint q = (vui >> bit_pos) & 3;
+        data_b[i + j] = D_TYPE(d * (float(q) - 1.0f));
+    }
+}
Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+#version 450
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+#include "mul_mat_vec_base.comp"
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
+
+void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
+    uint a_offset, b_offset, d_offset;
+    get_offsets(a_offset, b_offset, d_offset);
+
+    const uint num_blocks_per_row = p.ncols / QUANT_K;
+
+    const uint tid = gl_LocalInvocationID.x;
+
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+            temp[j][i] = FLOAT_TYPE(0);
+        }
+    }
+
+    [[unroll]] for (uint i = tid; i < num_blocks_per_row; i += gl_WorkGroupSize.x) {
+
+        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+            const uint ib0 = a_offset / QUANT_K + (first_row + n) * num_blocks_per_row;
+            const float d = float(data_a[ib0 + i].d);
+
+            [[unroll]] for (uint j = 0; j < 64; j += 32) {
+                [[unroll]] for (uint l = 0; l < 4; ++l) {
+                    [[unroll]] for (uint k = 0; k < 32; ++k) {
+                        // Extract quantized value: ((x[i].qs[j + k] >> (l*2)) & 3) - 1
+                        const uint q_byte = uint(data_a[ib0 + i].qs[j + k]);
+                        const uint shift = l * 2;
+                        const uint q = (q_byte >> shift) & 3;
+                        const FLOAT_TYPE dequant_val = FLOAT_TYPE(d * (float(q) - 1.0f)); // CPU kernel: (q-1)*d
+
+                        // y-data access pattern: y[i].qs[j*4 + l*32 + k]
+                        const uint b_idx = i * QUANT_K + j * 4 + l * 32 + k;
+                        if (b_idx < p.ncols) {
+                            [[unroll]] for (uint jcol = 0; jcol < NUM_COLS; ++jcol) {
+                                temp[jcol][n] += dequant_val * FLOAT_TYPE(data_b[jcol * p.batch_stride_b + b_offset + b_idx]);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    reduce_result(temp, d_offset, first_row, num_rows, tid);
+}
+
+void main() {
+    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+    if (first_row + NUM_ROWS <= p.stride_d) {
+        compute_outputs(first_row, NUM_ROWS);
+    } else {
+        if (first_row >= p.stride_d) {
+            return;
+        }
+        compute_outputs(first_row, p.stride_d - first_row);
+    }
+}
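
For reference, a plain-Python sketch (not part of the PR) of the per-row accumulation this shader performs, following the same j/l/k loop order and the (q - 1) * d mapping noted in its comments; the function and argument names are illustrative only.

def tq2_0_row_dot(blocks, y):
    # blocks: one (qs, d) pair per 256-column block of the weight row, where qs is a
    #         64-entry sequence of packed bytes and d is the float16 scale as a float
    # y: dense activation vector with len(y) == 256 * len(blocks)
    acc = 0.0
    for i, (qs, d) in enumerate(blocks):
        for j in range(0, 64, 32):      # two 32-byte halves of qs
            for l in range(4):          # 2-bit field within each byte (shift 0, 2, 4, 6)
                for k in range(32):
                    q = (qs[j + k] >> (2 * l)) & 3                        # code in {0, 1, 2}
                    acc += d * (q - 1) * y[i * 256 + j * 4 + l * 32 + k]  # (q-1)*d times activation
    return acc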

ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp

Lines changed: 16 additions & 0 deletions
@@ -450,6 +450,22 @@ void main() {
         buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
         buf_a[buf_idx + 2] = FLOAT_TYPE(v.z);
         buf_a[buf_idx + 3] = FLOAT_TYPE(v.w);
+#elif defined(DATA_A_TQ2_0)
+        const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+        const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
+
+        const uint ib = idx / 128;                         // 2 values per idx (like Q2_K)
+        const uint iqs = idx % 128;                        // 0..127
+        const uint qsi = (iqs / 64) * 32 + (iqs % 16) * 2; // Q2_K indexing pattern
+        const uint qsshift = ((iqs % 64) / 16) * 2;        // Q2_K shift: 0,2,4,6
+
+        const float d = float(data_a[ib].d);
+
+        const uvec2 qs = uvec2(data_a[ib].qs[qsi], data_a[ib].qs[qsi + 1]);
+        const vec2 v = d * (vec2((qs >> qsshift) & 3) - 1.0f); // (q-1)*d
+
+        buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
+        buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
 #elif defined(DATA_A_Q2_K)
         const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
         const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
+#version 450
+
+#include "types.comp"
+#include "generic_binary_head.comp"
+#include "dequant_funcs.comp"
+
+const uint num_threads = 256;
+layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
+
+void get_dst_indices(uint idx, out uint i20, out uint i21, out uint i22, out uint i23) {
+    i23 = fastdiv(idx, (p.ne22*p.ne21*p.ne20));
+    const uint i23_offset = i23 * p.ne22*p.ne21*p.ne20;
+    i22 = fastdiv((idx - i23_offset), (p.ne21*p.ne20));
+    const uint i22_offset = i22*p.ne21*p.ne20;
+    i21 = (idx - i23_offset - i22_offset) / p.ne20;
+    i20 = idx - i23_offset - i22_offset - i21*p.ne20;
+}
+
+void main() {
+    // num_threads * num_iter must equal 512 to match the wg_denoms and get_idx
+    const uint num_iter = 2;
+
+    const uint broadcast2 = uint(p.param2);
+    const uint broadcast3 = p.param3;
+
+    uint idx = get_idx();
+
+    [[unroll]] for (uint it = 0; it < num_iter; ++it) {
+        if (idx < p.ne) {
+            uint i0, i1, i2, i3;
+            get_dst_indices(idx, i0, i1, i2, i3);
+
+            float acc = 0.0f;
+
+            for (uint k = 0; k < p.ne01; k += 1) {
+                const uint a_block_base = get_aoffset() + (i3 / broadcast3) * p.nb03 + (i2 / broadcast2) * p.nb02 + k * p.nb01;
+                const uint ib = a_block_base + (i0 / QUANT_K);
+                const uint r = (i0 % QUANT_K);
+                const uint iqs = (r % 32u) + 32u * (r / 128u);
+                const uint sub = (r % 128u) / 32u;
+
+                const vec4 v = dequantize4(ib, iqs, 0);
+                const vec2 dm = get_dm(ib, 0);
+
+                float qv = (sub == 0u) ? v.x : (sub == 1u) ? v.y : (sub == 2u) ? v.z : v.w;
+                const float a_val = qv * dm.x + dm.y;
+
+                const uint b_idx = src1_idx(i1, k, i2, i3);
+                const float b = data_b[get_boffset() + b_idx];
+                acc += a_val * b;
+            }
+
+            uint d_idx = dst_idx(i0, i1, i2, i3);
+            data_d[get_doffset() + d_idx] = acc;
+        }
+        idx += num_threads;
+    }
+}

ggml/src/ggml-vulkan/vulkan-shaders/types.comp

Lines changed: 16 additions & 0 deletions
@@ -1355,6 +1355,22 @@ struct block_iq4_nl_packed16
 #define A_TYPE_PACKED16 block_iq4_nl_packed16
 #endif

+// TQ2_0
+#define QUANT_K_TQ2_0 256
+#define QUANT_R_TQ2_0 4
+
+struct block_tq2_0
+{
+    uint8_t qs[QUANT_K_TQ2_0/QUANT_R_TQ2_0]; // 256/4 = 64 bytes
+    float16_t d;
+};
+
+#if defined(DATA_A_TQ2_0)
+#define QUANT_K QUANT_K_TQ2_0
+#define QUANT_R QUANT_R_TQ2_0
+#define A_TYPE block_tq2_0
+#endif
+
 #define QUANT_K_MXFP4 32
 #define QUANT_R_MXFP4 2
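
For scale: each block_tq2_0 packs 256 ternary values into 64 bytes of 2-bit codes plus one float16 scale, i.e. 66 bytes per 256 weights, or 66 * 8 / 256 = 2.0625 bits per weight.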
