ggml-quants : ternary packing for TriLMs and BitNet b1.58 #8151

Merged: 33 commits, Sep 6, 2024
Changes from 11 commits

Commits
bd80749
ggml-quants : 1.625 bpw ternary packing for BitNet 1.58b
compilade Jun 19, 2024
7ef4254
ggml-quants : faster 1.625 bpw AVX2 vec_dot
compilade Jun 19, 2024
48b73b8
ggml-quants : subtract 1 when back in epi8
compilade Jun 19, 2024
ef1e345
ggml-quants : Q2_2 now faster than Q4_K with AVX2
compilade Jun 20, 2024
638ad52
ggml-quants : cleanup Q1_3 code formatting
compilade Jun 23, 2024
9465ec6
ggml-quants : ARM NEON vec_dot for q2_2 and q1_3
compilade Jun 25, 2024
89dc3b2
ggml-quants : use ceiling division when quantizing q1_3
compilade Jun 26, 2024
961e293
convert-hf : simplify BitNet pre-quantization
compilade Jun 26, 2024
0996149
convert-hf : allow converting the weird BitNet 1.3B
compilade Jun 27, 2024
bfd2f21
bitnet : replace 1.58b with b1.58, as in the paper
compilade Jun 29, 2024
ec50944
ggml-quants : fix build failure on Windows
compilade Jun 29, 2024
8fbd593
ggml-quants : attempt to fix Arm 32-bit support
compilade Jun 29, 2024
dd3e62a
ggml : add some informative comments in q1_3 vec_dot
compilade Jul 29, 2024
79a278e
Merge branch 'master' into compilade/bitnet-ternary
compilade Jul 29, 2024
77b8f84
ggml : add TQ1_0 and TQ2_0 ternary quantization types
compilade Jul 30, 2024
560873f
ggml : even faster TQ2_0
compilade Jul 31, 2024
e971957
ggml : also faster TQ1_0
compilade Jul 31, 2024
a6dd699
ggml : fix build issues in certain environments
compilade Aug 1, 2024
5417089
ggml : add NEON vec_dot implementation for TQ1_0 and TQ2_0
compilade Aug 1, 2024
45719a2
ggml : avoid directly using vmlal_high_s8, for 32-bit ARM compat
compilade Aug 1, 2024
04eec58
ggml : remove q1_3 and q2_2
compilade Aug 2, 2024
f034aa1
ggml-quants : rename fields of TQ1_0 and TQ2_0 structs for consistency
compilade Aug 3, 2024
96b3d41
ggml-quants : allow using vdotq_s32 in TQ2_0 vec_dot
compilade Aug 7, 2024
d911cd1
Merge branch 'master' into compilade/bitnet-ternary
compilade Aug 11, 2024
3a0bf17
gguf-py : Numpy (de)quantization for TQ1_0 and TQ2_0
compilade Aug 12, 2024
895004f
convert : allow direct conversion to TQ1_0 and TQ2_0
compilade Aug 13, 2024
69f7726
ggml-quants : allow using ARM dot product instructions for TQ1_0
compilade Aug 13, 2024
82b2404
Merge branch 'master' into compilade/bitnet-ternary
compilade Aug 13, 2024
35cc556
ggml-quants : deduplicate TQ1_0 and TQ2_0 __ARM_FEATURE_DOTPROD support
compilade Aug 13, 2024
cb6d996
Merge branch 'master' into compilade/bitnet-ternary
compilade Aug 22, 2024
7f3a619
Merge branch 'master' into compilade/bitnet-ternary
compilade Sep 4, 2024
8d61607
ggml : remove unused ggml_mul special case
compilade Sep 4, 2024
75b3a09
test-backend-ops : add TQ1_0 and TQ2_0 comments for later
compilade Sep 4, 2024
49 changes: 40 additions & 9 deletions convert-hf-to-gguf.py
@@ -265,7 +265,10 @@ def write_tensors(self):
break

for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
data: np.ndarray = data # type hint
data: np.ndarray # type hint
if len(data.shape) == 0:
# otherwise single-value tensors get squeezed
data = data.reshape((1,))
n_dims = len(data.shape)
data_dtype = data.dtype
data_qtype: gguf.GGMLQuantizationType | None = None
@@ -296,12 +299,33 @@ def write_tensors(self):
))

if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
# TODO: cleaner model-specific per-tensor types
# NOTE: Q1_3 is only relevant for BitNet b1.58
if (
self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3
and gguf.can_quantize_to_q1_3(data)
and not any(
self.match_model_tensor_name(new_name, key, None)
for key in [
gguf.MODEL_TENSOR.TOKEN_EMBD,
gguf.MODEL_TENSOR.OUTPUT,
]
)
):
data = gguf.quantize_q1_3(data)
assert data.dtype == np.uint8
data_qtype = gguf.GGMLQuantizationType.Q1_3

elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
data = gguf.quantize_bf16(data)
assert data.dtype == np.int16
data_qtype = gguf.GGMLQuantizationType.BF16

elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
elif (
self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0
or self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3
and gguf.can_quantize_to_q8_0(data)
):
data = gguf.quantize_q8_0(data)
assert data.dtype == np.uint8
data_qtype = gguf.GGMLQuantizationType.Q8_0
@@ -1412,6 +1436,12 @@ def write_tensors(self):
class BitnetModel(Model):
model_arch = gguf.MODEL_ARCH.BITNET

def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, *args, **kwargs):
if ftype == gguf.LlamaFileType.GUESSED:
ftype = gguf.LlamaFileType.MOSTLY_Q1_3

super().__init__(dir_model, ftype, *args, **kwargs)

def set_vocab(self):
self._set_vocab_sentencepiece()

@@ -1423,12 +1453,13 @@ def set_gguf_parameters(self):
def weight_quant(self, weight):
dtype = weight.dtype
weight = weight.float()
s = 1 / weight.abs().mean().clamp(min=1e-5)
weight = (weight * s).round().clamp(-1, 1) / s
scale = weight.abs().max().unsqueeze(0)
weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
weight = torch.sign(weight).type(dtype)
return weight.type(dtype), scale.type(torch.float32)
scale = weight.abs().mean().clamp(min=1e-5)
iscale = 1 / scale
weight = (weight * iscale).round().clamp(-1, 1)
# TODO: use the scale directly instead of inverting it twice
# (this is also unnecessarily doubly inverted upstream)
# ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
return weight.type(dtype), (1 / iscale).type(torch.float32)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
new_name = self.map_tensor_name(name)
2 changes: 2 additions & 0 deletions examples/quantize/quantize.cpp
@@ -26,6 +26,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
{ "Q1_3", LLAMA_FTYPE_MOSTLY_Q1_3, " 1.63 bpw for BitNet b1.58", },
{ "Q2_2", LLAMA_FTYPE_MOSTLY_Q2_2, " 2.00 bpw for BitNet b1.58", },
@compilade (Collaborator, Author) commented on Jun 29, 2024

Regarding the names of the new quant types: since these are quite specific to BitNet models, I was thinking of naming them something starting with QB, a bit like what was suggested in #5761 (comment).

I'll first describe what I want from the naming scheme, then attempt to make it work.

The naming scheme should have room for:

  • Ternary types in {-1, 0, 1}
    • 1.625 bpw quant with a block size of 64, with 13 bytes per block (see the base-3 packing sketch after this list)
      • To make the smallest possible lossless BitNet b1.58 model files
      • Uses Q8_0 as its vec_dot_type (for the activations)
      • (It's technically possible to store a float16 scale in the leftover bits in the last byte of 16 consecutive blocks (this means 1024 elements minimum per row), although it can't really be extracted with SIMD)
    • 2.000 bpw quant with a block size of 32, with 8 bytes per block
      • For maximal performance
      • Uses Q8_0 as its vec_dot_type (for the activations)
    • 2.000 bpw quant with a block size of 64, with 16 bytes per block, and a float16 scale
      • Values would be packed similarly to the 1.625 bpw type, but with an extra byte and a row-wise float16 scale duplicated in each block.
    • 2.000 bpw quant with a block size of 4, with 1 byte per block
      • For weirdly-shaped models like the 1.3B BitNet b1.58 model
      • Needs a compatible vec_dot_type
        • float types are slower than integer types for this
  • Binary types in {-1, 1}
    • 1 bpw type
  • Binary types in {0, 1}
    • Are there models which use this?
  • 8-bit activation with a row-wise scale
    • 8.5 bpw like Q8_0, but all the scales of a row are the same
      • Would allow reducing the number of float32 operations in the vec_dot of the above types.
    • 10 bpw, 5 bytes per block of 4 elements, with a weird layout which only uses blocks to get a big enough buffer, with a single float32 scale and some padding before all row elements, aligned and contiguous.
      • For use with the weird 2.000 bpw type, and also maybe the other ones for best performance.
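
As a rough illustration of the 1.625 bpw idea: five ternary digits fit in a single byte because 3^5 = 243 <= 255. Below is a minimal sketch using plain base-3 packing; the actual Q1_3 layout in this PR differs in details (element order and a scaled encoding that makes extraction cheaper).

```c
#include <stdint.h>

// Sketch only: pack five ternary digits (each in {-1, 0, +1}) into one byte
// with plain base-3 positional coding; 3^5 = 243 <= 255, so they fit.
uint8_t pack5_trits(const int8_t x[5]) {
    uint8_t q = 0;
    for (int i = 4; i >= 0; --i) {
        q = (uint8_t)(q * 3 + (x[i] + 1)); // map {-1,0,+1} -> {0,1,2}
    }
    return q;
}

void unpack5_trits(uint8_t q, int8_t x[5]) {
    for (int i = 0; i < 5; ++i) {
        x[i] = (int8_t)(q % 3) - 1; // map {0,1,2} -> {-1,0,+1}
        q /= 3;
    }
}
```

Twelve such bytes hold 60 of a block's 64 elements; the remaining 4 elements go into one extra byte, giving 13 bytes per 64 weights, i.e. 13 * 8 / 64 = 1.625 bpw.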

So the naming scheme could be:

  • QB<x>_<y>
    • where <x> is the floor of the expected bpw of the type
    • where <y> is
      • 0 binary type, {0, 1}
        • except for QB8_0 which is like Q8_0 but with a guaranteed duplicated row-wise scale
      • 1 binary type, {-1, 1}
      • 2 ternary type using some kind of binary-coded ternary
      • 3 ternary type with fixed-point packed values
      • 4 weird type with a block size of 4

Which for the previously-mentioned possible BitNet types would mean:

| proposed name | Range | bits per weight | block size | bytes | row-wise scale | current name |
|---------------|-------|-----------------|------------|-------|----------------|--------------|
| QB1_3 | {-1, 0, 1} | 1.625 | 64 | 13 | 1.0f | Q1_3 |
| QB2_2 | {-2, -1, 0, 1} | 2.000 | 32 | 8 | 1.0f | Q2_2 |
| QB2_3 | {-1, 0, 1} | 2.000 | 64 | 16 | f16 | |
| QB2_4 | {-2, -1, 0, 1} | 2.000 | 4 | 1 | 1.0f | |
| QB1_1 | {-1, 1} | 1.000 | ? | ?/8 | 1.0f | |
| QB1_0 | {0, 1} | 1.000 | ? | ?/8 | 1.0f | |
| QB8_0 | [-127, 127] | 8.5 | 32 | 34 | f16 | |
| QB8_4 | [-127, 127] | 10 | 4 | 5 | f32, weird layout | |
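
The bits-per-weight column follows directly from 8 * bytes per block / block size; a tiny throwaway sketch (not part of the PR) that reproduces the numbers:

```c
#include <stdio.h>

// bpw = 8 * bytes_per_block / block_size for each proposed type above.
int main(void) {
    printf("QB1_3: %.3f bpw\n", 8.0 * 13 / 64); // 1.625
    printf("QB2_2: %.3f bpw\n", 8.0 *  8 / 32); // 2.000
    printf("QB2_3: %.3f bpw\n", 8.0 * 16 / 64); // 2.000
    printf("QB2_4: %.3f bpw\n", 8.0 *  1 /  4); // 2.000
    printf("QB8_0: %.3f bpw\n", 8.0 * 34 / 32); // 8.500
    printf("QB8_4: %.3f bpw\n", 8.0 *  5 /  4); // 10.000
    return 0;
}
```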

I'm not saying these should all exist, though, only that the naming scheme should not be too limiting for possible future extensions (which might not exist anyway due to lack of time).

So I think I'll rename Q1_3 to QB1_3, and Q2_2 to QB2_2. Does anyone have comments on this, or a better naming scheme for the new BitNet quant types?


If it were me, considering this only works with bitnet models and nothing else, I'd want the designations to be exceptionally clear that they are different and shouldn't be used on just anything. "QB" is good, but I'd take it a step further and remove the Q entirely. As bitnet is being colloquially referred to as a "1-bit" model, B1 makes more sense. Considering the plausible range for weights, I'd cut it off at tenths and ditch the decimal. This leaves plenty of room for variations, while making the native BPW very clear. I feel this is superior to the arbitrary "_2" and "_3" subtypes.

So what I would propose is:

1.625bpw = B1_16
2.000bpw = B1_20

{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
{ "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
2 changes: 2 additions & 0 deletions ggml/include/ggml.h
@@ -383,6 +383,8 @@ extern "C" {
GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30,
GGML_TYPE_Q2_2 = 31,
GGML_TYPE_Q1_3 = 32,
GGML_TYPE_COUNT,
};

50 changes: 50 additions & 0 deletions ggml/src/ggml-common.h
@@ -137,6 +137,20 @@ typedef sycl::half2 ggml_half2;

#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP

// 1.625 bpw for BitNet b1.58 models
#define QK1_3 64
typedef struct {
uint8_t q[(QK1_3 - 4*QK1_3/64)/5]; // 5 elements per byte (3^5 = 243 < 256)
uint8_t qs[QK1_3/64]; // 4 elements per byte
} block_q1_3;
static_assert(sizeof(block_q1_3) == (QK1_3 - 4*QK1_3/64)/5 + QK1_3/64, "wrong q1_3 block size/padding");

#define QK2_2 32
typedef struct {
uint8_t qs[QK2_2 / 4]; // nibbles / quants
} block_q2_2;
static_assert(sizeof(block_q2_2) == QK2_2 / 4, "wrong q2_2 block size/padding");
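
For illustration only, here is a hedged sketch of decoding one block_q2_2 byte into four values in {-2, -1, 0, 1}, assuming signed 2-bit fields stored low bits first; the element ordering inside a block in the PR's actual quantize/dequantize code may differ.

```c
#include <stdint.h>

// block_q1_3: 12 bytes x 5 trits + 1 byte x 4 trits = 64 weights in 13 bytes (1.625 bpw).
// block_q2_2: 8 bytes x 4 two-bit fields = 32 weights in 8 bytes (2.0 bpw).
//
// Sketch (assumed field order, low bits first): decode one q2_2 byte.
void unpack_q2_2_byte(uint8_t q, int8_t out[4]) {
    for (int i = 0; i < 4; ++i) {
        int8_t v = (q >> (2 * i)) & 0x03;   // extract the 2-bit field
        out[i] = (int8_t)((v ^ 0x02) - 2);  // sign-extend two's complement: {0,1,2,3} -> {0,1,-2,-1}
    }
}
```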

#define QK4_0 32
typedef struct {
ggml_half d; // delta
@@ -333,6 +347,7 @@ typedef struct {
} block_iq3_s;
static_assert(sizeof(block_iq3_s) == sizeof(ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");

// 1.5625 bpw
typedef struct {
ggml_half d;
uint8_t qs[QK_K/8];
@@ -1022,6 +1037,41 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
GGML_TABLE_END()

GGML_TABLE_BEGIN(uint32_t, q1_3_grid, 256)
0xffffffff, 0xffffffff, 0xffffff00, 0xffffff01, 0xffff00ff, 0xffff0000, 0xffff0001, 0xffff01ff,
0xffff0100, 0xffff0101, 0xff00ffff, 0xff00ff00, 0xff00ff01, 0xff0000ff, 0xff000000, 0xff000001,
0xff0001ff, 0xff000100, 0xff000101, 0xff01ffff, 0xff01ffff, 0xff01ff00, 0xff01ff01, 0xff0100ff,
0xff010000, 0xff010001, 0xff0101ff, 0xff010100, 0xff010101, 0x00ffffff, 0x00ffff00, 0x00ffff01,
0x00ff00ff, 0x00ff0000, 0x00ff0001, 0x00ff01ff, 0x00ff0100, 0x00ff0101, 0x0000ffff, 0x0000ff00,
0x0000ff00, 0x0000ff01, 0x000000ff, 0x00000000, 0x00000001, 0x000001ff, 0x00000100, 0x00000101,
0x0001ffff, 0x0001ff00, 0x0001ff01, 0x000100ff, 0x00010000, 0x00010001, 0x000101ff, 0x00010100,
0x00010101, 0x01ffffff, 0x01ffff00, 0x01ffff01, 0x01ffff01, 0x01ff00ff, 0x01ff0000, 0x01ff0001,
0x01ff01ff, 0x01ff0100, 0x01ff0101, 0x0100ffff, 0x0100ff00, 0x0100ff01, 0x010000ff, 0x01000000,
0x01000001, 0x010001ff, 0x01000100, 0x01000101, 0x0101ffff, 0x0101ff00, 0x0101ff01, 0x0101ff01,
0x010100ff, 0x01010000, 0x01010001, 0x010101ff, 0x01010100, 0x01010101, 0xffffffff, 0xffffff00,
0xffffff01, 0xffff00ff, 0xffff0000, 0xffff0001, 0xffff01ff, 0xffff0100, 0xffff0101, 0xff00ffff,
0xff00ff00, 0xff00ff01, 0xff0000ff, 0xff0000ff, 0xff000000, 0xff000001, 0xff0001ff, 0xff000100,
0xff000101, 0xff01ffff, 0xff01ff00, 0xff01ff01, 0xff0100ff, 0xff010000, 0xff010001, 0xff0101ff,
0xff010100, 0xff010101, 0x00ffffff, 0x00ffff00, 0x00ffff01, 0x00ff00ff, 0x00ff0000, 0x00ff0000,
0x00ff0001, 0x00ff01ff, 0x00ff0100, 0x00ff0101, 0x0000ffff, 0x0000ff00, 0x0000ff01, 0x000000ff,
0x00000000, 0x00000001, 0x000001ff, 0x00000100, 0x00000101, 0x0001ffff, 0x0001ff00, 0x0001ff01,
0x000100ff, 0x00010000, 0x00010000, 0x00010001, 0x000101ff, 0x00010100, 0x00010101, 0x01ffffff,
0x01ffff00, 0x01ffff01, 0x01ff00ff, 0x01ff0000, 0x01ff0001, 0x01ff01ff, 0x01ff0100, 0x01ff0101,
0x0100ffff, 0x0100ff00, 0x0100ff01, 0x010000ff, 0x01000000, 0x01000001, 0x01000001, 0x010001ff,
0x01000100, 0x01000101, 0x0101ffff, 0x0101ff00, 0x0101ff01, 0x010100ff, 0x01010000, 0x01010001,
0x010101ff, 0x01010100, 0x01010101, 0xffffffff, 0xffffff00, 0xffffff01, 0xffff00ff, 0xffff0000,
0xffff0001, 0xffff01ff, 0xffff01ff, 0xffff0100, 0xffff0101, 0xff00ffff, 0xff00ff00, 0xff00ff01,
0xff0000ff, 0xff000000, 0xff000001, 0xff0001ff, 0xff000100, 0xff000101, 0xff01ffff, 0xff01ff00,
0xff01ff01, 0xff0100ff, 0xff010000, 0xff010001, 0xff0101ff, 0xff0101ff, 0xff010100, 0xff010101,
0x00ffffff, 0x00ffff00, 0x00ffff01, 0x00ff00ff, 0x00ff0000, 0x00ff0001, 0x00ff01ff, 0x00ff0100,
0x00ff0101, 0x0000ffff, 0x0000ff00, 0x0000ff01, 0x000000ff, 0x00000000, 0x00000001, 0x000001ff,
0x00000100, 0x00000100, 0x00000101, 0x0001ffff, 0x0001ff00, 0x0001ff01, 0x000100ff, 0x00010000,
0x00010001, 0x000101ff, 0x00010100, 0x00010101, 0x01ffffff, 0x01ffff00, 0x01ffff01, 0x01ff00ff,
0x01ff0000, 0x01ff0001, 0x01ff01ff, 0x01ff0100, 0x01ff0101, 0x01ff0101, 0x0100ffff, 0x0100ff00,
0x0100ff01, 0x010000ff, 0x01000000, 0x01000001, 0x010001ff, 0x01000100, 0x01000101, 0x0101ffff,
0x0101ff00, 0x0101ff01, 0x010100ff, 0x01010000, 0x01010001, 0x010101ff, 0x01010100, 0x01010101,
GGML_TABLE_END()
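
The 256 entries above (including the occasional repeated value) are consistent with the following rule, given here only as a reading aid and not necessarily the generator actually used: take v = i * 243 / 256 (integer division), write v in base 3, and map its four lowest digits {0, 1, 2} to the signed bytes {0xff, 0x00, 0x01}, i.e. {-1, 0, +1}, lowest digit in the lowest byte.

```c
#include <stdint.h>
#include <stdio.h>

// Assumed reconstruction of q1_3_grid: entry i holds the four lowest base-3
// digits of (i * 243) / 256, each digit mapped to a signed byte {-1, 0, +1}.
int main(void) {
    for (int i = 0; i < 256; ++i) {
        uint32_t v = (uint32_t)(i * 243) >> 8; // hypothesised inverse of a 256/243 scaling
        uint32_t entry = 0;
        for (int d = 0; d < 4; ++d) {
            uint8_t byte = (uint8_t)((v % 3) - 1); // {0,1,2} -> 0xff, 0x00, 0x01
            entry |= (uint32_t)byte << (8 * d);
            v /= 3;
        }
        printf("0x%08x,%c", entry, (i % 8 == 7) ? '\n' : ' ');
    }
    return 0;
}
```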

#define NGRID_IQ1S 2048
#define IQ1S_DELTA 0.125f
#define IQ1M_DELTA 0.125f