@@ -240,23 +240,6 @@ def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: i
         return False
 
     def write_tensors(self):
-        # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
-        def np_fp32_to_bf16(n: np.ndarray):
-            # force nan to quiet
-            n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
-            # flush subnormals to zero
-            n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
-            # round to nearest even
-            n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
-            return n.astype(np.int16)
-
-        # Doing this row-wise is much, much faster than element-wise, hence the signature
-        v_fp32_to_bf16 = np.vectorize(np_fp32_to_bf16, otypes=[np.int16], signature="(n)->(n)")
-        if self.lazy:
-            # TODO: find a way to implicitly wrap np.vectorize functions
-            # NOTE: the type is changed to reflect otypes passed to np.vectorize above
-            v_fp32_to_bf16 = gguf.LazyNumpyTensor._wrap_fn(v_fp32_to_bf16, meta_noop=np.int16)
-
         max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
 
         for name, data_torch in self.get_tensors():
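For reference, the removed helper did the fp32-to-bf16 rounding by hand before delegating it to gguf-py. A minimal standalone sketch of that bit manipulation, using unsigned views instead of the original int32/int16 ones so the arithmetic also behaves under NumPy 2.x promotion rules; this is an illustration of the rounding being replaced, not the gguf.quantize_bf16 implementation:

```python
import numpy as np

def fp32_to_bf16_bits(a: np.ndarray) -> np.ndarray:
    # View the float32 bit patterns as unsigned 32-bit integers.
    n = np.asarray(a, dtype=np.float32).view(np.uint32)
    # Force NaN to a quiet NaN.
    n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
    # Flush subnormals to (signed) zero.
    n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
    # Round to nearest even, then keep the high 16 bits.
    n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
    return n.astype(np.uint16)

x = np.array([1.0, 3.140625, 1e-40], dtype=np.float32)
print([hex(v) for v in fp32_to_bf16_bits(x)])  # ['0x3f80', '0x4049', '0x0']
```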
@@ -309,27 +292,31 @@ def np_fp32_to_bf16(n: np.ndarray):
                 ))
 
             if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
-                if self.ftype == gguf.LlamaFileType.MOSTLY_F16:
+                if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                    data = gguf.quantize_bf16(data)
+                    assert data.dtype == np.int16
+                    data_qtype = gguf.GGMLQuantizationType.BF16
+
+                elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
+                    data = gguf.quantize_q8_0(data)
+                    assert data.dtype == np.uint8
+                    data_qtype = gguf.GGMLQuantizationType.Q8_0
+
+                else:  # default to float16 for quantized tensors
                     if data_dtype != np.float16:
                         data = data.astype(np.float16)
                     data_qtype = gguf.GGMLQuantizationType.F16
 
-                elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
-                    if data_dtype != np.float32:
-                        data = data.astype(np.float32)
-                    data = v_fp32_to_bf16(data.view(np.int32))
-                    assert data.dtype == np.int16
-                    data_qtype = gguf.GGMLQuantizationType.BF16
-
-                else:  # by default, convert to float32
+            if data_qtype is None:  # by default, convert to float32
                 if data_dtype != np.float32:
                     data = data.astype(np.float32)
                 data_qtype = gguf.GGMLQuantizationType.F32
 
-            assert data_qtype is not None
-
+            block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
             # reverse shape to make it similar to the internal ggml dimension order
-            shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
+            shape_str = f"""{{{', '.join(str(n) for n in reversed(
+                (*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
+            )}}}"""
 
             # n_dims is implicit in the shape
             logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
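The new shape_str expression exists because quantized tensors reach the writer as raw bytes: a Q8_0 row packs every 32 weights into a 34-byte block (one float16 scale plus 32 int8 values), so the logged shape has to convert the last dimension from a byte count back into an element count. A rough sketch of that arithmetic, assuming the gguf-py package from the same tree is importable:

```python
import numpy as np
import gguf

# Quantize a toy float32 tensor directly to Q8_0, as the new code path does.
data = np.random.rand(64, 4096).astype(np.float32)
assert gguf.can_quantize_to_q8_0(data)  # last dim must be a multiple of the block size
q = gguf.quantize_q8_0(data)
print(q.dtype, q.shape)                 # uint8 (64, 4352): 4096 / 32 blocks * 34 bytes each

# Recover the logical shape the same way the new shape_str expression does.
block_size, type_size = gguf.GGML_QUANT_SIZES[gguf.GGMLQuantizationType.Q8_0]
last_dim = q.shape[-1] * q.dtype.itemsize // type_size * block_size
print((*q.shape[:-1], last_dim))        # (64, 4096)
```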
@@ -2415,25 +2402,15 @@ class LazyTorchTensor(gguf.LazyBase):
     def numpy(self) -> gguf.LazyNumpyTensor:
         dtype = self._dtype_map[self.dtype]
         return gguf.LazyNumpyTensor(
-            meta=np.lib.stride_tricks.as_strided(np.zeros(1, dtype), self.shape, (0 for _ in self.shape)),
+            meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
             lazy=self._lazy,
             args=(self,),
             func=(lambda s: s[0].numpy())
         )
 
     @classmethod
-    def eager_to_meta(cls, t: Tensor) -> Tensor:
-        if t.is_meta:
-            return t
-        return t.detach().to("meta")
-
-    @classmethod
-    def meta_with_dtype(cls, m: Tensor, dtype: torch.dtype) -> Tensor:
-        m = m.detach()
-        if not m.is_meta:
-            m = m.to("meta")
-        m.dtype = dtype
-        return m
+    def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor:
+        return torch.empty(size=shape, dtype=dtype, device="meta")
 
     @classmethod
     def __torch_function__(cls, func, types, args=(), kwargs=None):
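The rewritten helper builds placeholders on PyTorch's "meta" device instead of converting detached tensors in place: a meta tensor carries only shape and dtype, which is all the lazy-conversion machinery needs in order to trace operations without materializing the weights. A quick illustration, not tied to this file's classes:

```python
import torch

# No storage is allocated on the meta device; only shape and dtype are tracked.
m = torch.empty(size=(4096, 11008), dtype=torch.float16, device="meta")
print(m.is_meta, m.shape, m.dtype)  # True torch.Size([4096, 11008]) torch.float16

# Shape propagation still works, so downstream code can plan a conversion
# without ever loading the real tensor data.
print((m @ m.T).shape)              # torch.Size([4096, 4096])
```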
@@ -2464,8 +2441,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -2523,6 +2500,7 @@ def main() -> None:
25232500 "f32" : gguf .LlamaFileType .ALL_F32 ,
25242501 "f16" : gguf .LlamaFileType .MOSTLY_F16 ,
25252502 "bf16" : gguf .LlamaFileType .MOSTLY_BF16 ,
2503+ "q8_0" : gguf .LlamaFileType .MOSTLY_Q8_0 ,
25262504 "auto" : gguf .LlamaFileType .GUESSED ,
25272505 }
25282506
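With the new choice wired through parse_args() and main()'s ftype_map, a direct Q8_0 conversion can be requested with something like `python convert-hf-to-gguf.py <model-dir> --outtype q8_0` (assuming the converter keeps its usual script name and positional model-directory argument), while `--outtype bf16` now routes through gguf.quantize_bf16 in the same way.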