
Commit e417818

new gemlite integration using pip install
Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
1 parent a8fd19c commit e417818

9 files changed: 8 additions, 688 deletions


torchao/_models/llama/benchmark_results.txt

Lines changed: 2 additions & 1 deletion

@@ -69,4 +69,5 @@ bs4
 20241008155928, tok/s= 49.45, mem/s= 214.18 GB/s, peak_mem= 7.81 GB, model_size= 4.33 GB quant: gemlite-4-64, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization gemlite-4-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.float16 --compile --num_samples 1 --max_new_tokens 200 --batch_size 4 --top_k 200 --temperature 0.8
 20241008160515, tok/s= 51.74, mem/s= 224.09 GB/s, peak_mem= 7.79 GB, model_size= 4.33 GB quant: gemlite-4-64, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization gemlite-4-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 1 --max_new_tokens 200 --batch_size 4 --top_k 200 --temperature 0.8
 
-20241029013738, tok/s= 12.81, mem/s= 1.40 GB/s, peak_mem=14.55 GB, model_size= 0.11 GB quant: gemlite-4-128, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization gemlite-4-128 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.float16 --num_samples 1 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20241029013738, tok/s= 12.81, mem/s= 1.40 GB/s, peak_mem=14.55 GB, model_size= 0.11 GB quant: gemlite-4-128, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization gemlite-4-128 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.float16 --num_samples 1 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20241029015254, tok/s= 12.73, mem/s= 1.39 GB/s, peak_mem=14.55 GB, model_size= 0.11 GB quant: gemlite-4-128, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization gemlite-4-128 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.float16 --num_samples 1 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8

torchao/_models/llama/benchmarks.sh

Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@ export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder
 
 # README BENCHMARKS
 export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemlite-4-128 --num_samples 1 --write_result benchmark_results.txt
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemlite-4-128 --num_samples 1 --write_result benchmark_results.txt --batch_size 16
 
 
 # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt

torchao/_models/llama/generate.py

Lines changed: 4 additions & 2 deletions

@@ -240,6 +240,9 @@ def main(
     assert group_size in [64, 128, 256], f"group_size needs to be in [64, 128, 256], got {group_size} for gemlite-<W_nbits>-<group_size>"
     assert precision == torch.float16, f"gemlite only supports float16 precision, got {precision}"
 
+    quant_config = BaseQuantizeConfig(nbits=W_nbits, group_size=group_size, quant_zero=False, quant_scale=False, axis=1)
+    quant_config['weight_quant_params']['optimize'] = False
+
     def replace_fn(mod):
         if not isinstance(mod, torch.nn.Linear):
             return mod
@@ -250,8 +253,7 @@ def replace_fn(mod):
         compute_dtype = mod.weight.dtype
         input_dtype, output_dtype = DType.FP16, DType.FP16
 
-        quant_config = BaseQuantizeConfig(nbits=W_nbits, group_size=group_size, quant_zero=False, quant_scale=False, axis=1)
-        quant_config['weight_quant_params']['optimize'] = False
+
         hqq_layer = HQQLinear(mod, quant_config=quant_config, compute_dtype=compute_dtype, device=device, del_orig=False)
         orig_shape = (out_features, in_features)
         gemlite_linear = GemLiteLinearTriton(W_nbits=W_nbits,
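
For readers following the generate.py hunk above, a minimal sketch of the hoisted configuration: quant_config is now built once and captured by replace_fn instead of being rebuilt for every nn.Linear. The replace_fn body is abbreviated; the actual code in generate.py goes on to repack the HQQ-quantized weights into a GemLiteLinearTriton layer, and the values of W_nbits, group_size and device here are illustrative assumptions, not taken from the commit.

import torch
from hqq.core.quantize import BaseQuantizeConfig, HQQLinear

W_nbits, group_size, device = 4, 64, "cuda"  # illustrative settings

# Built once, outside replace_fn (the change in this commit), so every
# replaced nn.Linear shares the same HQQ quantization settings.
quant_config = BaseQuantizeConfig(nbits=W_nbits, group_size=group_size,
                                  quant_zero=False, quant_scale=False, axis=1)
quant_config['weight_quant_params']['optimize'] = False

def replace_fn(mod):
    if not isinstance(mod, torch.nn.Linear):
        return mod
    # HQQ quantizes the float16 weights; generate.py then transfers the packed
    # weights, scales and zeros into a GemLiteLinearTriton kernel (omitted here).
    hqq_layer = HQQLinear(mod, quant_config=quant_config,
                          compute_dtype=mod.weight.dtype,
                          device=device, del_orig=False)
    return hqq_layer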

torchao/_models/llama/model.py

Lines changed: 1 addition & 1 deletion

@@ -19,7 +19,7 @@ def prepare_inputs_for_model(inps, max_new_tokens=1):
     if inps.dim() > 2:
         raise ValueError(f"Expected input to be of dim 1 or 2, but got {inps.dim()}")
 
-    input_pos = torch.arange(0, inps.size(-1), device=inps.device)
+    input_pos = torch.arange(0, inps.numel(), device=inps.device)
     return (inps.view(1, -1), input_pos)
 
 @dataclass
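
This change matters once generate.py is benchmarked with --batch_size > 1: prepare_inputs_for_model flattens the prompt to shape (1, batch*seq) via view(1, -1), so input_pos must cover every element rather than only the last dimension. A small self-contained check (the shapes are illustrative, not taken from the benchmark):

import torch

inps = torch.zeros(16, 8, dtype=torch.int64)   # e.g. batch_size=16, seq_len=8

old_pos = torch.arange(0, inps.size(-1))       # 8 positions: too few for the flattened view
new_pos = torch.arange(0, inps.numel())        # 128 positions: one per flattened token

flat = inps.view(1, -1)                        # shape (1, 128), as returned by prepare_inputs_for_model
assert flat.size(-1) == new_pos.numel()        # holds with the new code
assert flat.size(-1) != old_pos.numel()        # the old code would mismatch for batched input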

torchao/quantization/prototype/gemlite/__init__.py

Whitespace-only changes.

torchao/quantization/prototype/gemlite/core.py

Lines changed: 0 additions & 238 deletions
This file was deleted.

torchao/quantization/prototype/gemlite/triton_kernels/__init__.py

Lines changed: 0 additions & 4 deletions
This file was deleted.
