Hi,
I tested BitBLAS models with the https://github.com/ModelCloud/GPTQModel repo. The output is correct; however, BitBLAS reaches roughly the same token generation speed with the low-bit (2-bit and 4-bit) models as with the FP16 model. Detailed results are as follows:

The corresponding test code is:
import argparse
import time

import torch
from transformers import AutoTokenizer
from gptqmodel import GPTQModel, get_backend


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default=None, type=str, help="path of the quantized model")
    parser.add_argument("--wbits", type=int, default=4, help="quantization bits")
    parser.add_argument("--group_size", type=int, default=128, help="quantization group size")
    parser.add_argument("--test_speed", action="store_true")

    args = parser.parse_args()
    tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=False, legacy=False)
    # device_map='auto' already places the model on the GPU, so no extra .cuda() call is needed
    model = GPTQModel.from_quantized(args.model, device_map='auto', torch_dtype=torch.float16,
                                     backend=get_backend('BITBLAS'))
    print(f"memory footprint after loading quantized model: {torch.cuda.max_memory_allocated('cuda') / 1024**3:.2f}GiB")
    if args.test_speed:
        prompt = "Write a poem about large language model:"
        input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
        start_time = time.time()
        output = model.generate(inputs=input_ids, do_sample=True, top_k=10, max_new_tokens=256)
        end_time = time.time()
        # count only newly generated tokens (exclude the prompt)
        new_tokens = output.shape[1] - input_ids.shape[1]
        speed = new_tokens / (end_time - start_time)
        print(tokenizer.decode(output[0]))
        print(f"generation speed: {speed:.2f} tokens/s")


if __name__ == '__main__':
    main()
Do you know what potential problem might be hindering the speedup? Thank you.
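
One thing worth checking on the measurement side: a single short generate() call folds one-time costs (CUDA context setup, first-use kernel selection/tuning) into the timing. Below is a minimal sketch of a more isolated measurement, assuming the same model/tokenizer objects and generate() API as in the script above; timed_generate is a hypothetical helper, not part of GPTQModel:

import time

import torch


@torch.inference_mode()
def timed_generate(model, tokenizer, prompt, max_new_tokens=256):
    # hypothetical helper: reuses the model and tokenizer loaded by the script above
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()

    # warm-up: the first generate() call pays one-time costs (CUDA context,
    # kernel selection/tuning) that should not count toward decode speed
    model.generate(inputs=input_ids, max_new_tokens=8)
    torch.cuda.synchronize()

    start = time.time()
    output = model.generate(inputs=input_ids, do_sample=True, top_k=10,
                            max_new_tokens=max_new_tokens)
    torch.cuda.synchronize()  # wait for all queued GPU work before stopping the clock
    elapsed = time.time() - start

    new_tokens = output.shape[1] - input_ids.shape[1]  # exclude the prompt
    return new_tokens / elapsed

Note also that at batch size 1 with short sequences, decoding is often dominated by kernel-launch and framework overhead rather than GEMM time, which can mask the speedup of low-bit kernels.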