parser.add_argument("--calibration_limit", type=int, default=10, help="Number of calibration examples")
467
+
parser.add_argument("--calibration_seq_length", type=int, default=256, help="Sequence length for calibration")
437
468
parser.add_argument('--kv_cache_quantization', action='store_true', help='Whether to quantize the KV cache')
438
469
parser.add_argument('--cache_size', type=int, default=None, help='Force size of cache to be a certain number of tokens, if not set, will use max_new_tokens+prompt_size')
439
470
parser.add_argument('--linear_causal_mask', action='store_true', help='Whether to use the memory efficient, but slightly less fast, linear causal mask (important for long context lengths)')
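For context, here is a minimal, hedged sketch of how the `--cache_size` fallback described in its help text (max_new_tokens + prompt_size when unset) might be resolved after parsing. The `--max_new_tokens` flag and `prompt_size` variable are illustrative assumptions, not necessarily how generate.py wires these values together.

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--kv_cache_quantization', action='store_true', help='Whether to quantize the KV cache')
parser.add_argument('--cache_size', type=int, default=None, help='Force size of cache in tokens')
parser.add_argument('--max_new_tokens', type=int, default=200, help='Assumed companion flag, for illustration only')
args = parser.parse_args()

prompt_size = 128  # hypothetical; the real script would derive this from the tokenized prompt length

# Per the --cache_size help text: fall back to max_new_tokens + prompt_size when the flag is unset.
cache_size = args.cache_size if args.cache_size is not None else args.max_new_tokens + prompt_size
print(f"cache_size={cache_size} tokens, kv_cache_quantized={args.kv_cache_quantization}")
```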
Adapted from https://github.com/mit-han-lab/llm-awq
## Benchmarks
Evaluation perplexity numbers were calculated using the script in awq/example.py. A group size of 64 was used for all quantization methods. For Llama-2-7b-chat-hf, performance benchmarks were calculated using the torchao/_models/llama/generate.py script, run on a 1xA100 80GB SXM4 instance. The awq-uint4 quantization method does not use an efficient fused kernel, which is why its performance is not great. awq-hqq uses the tinygemm int4->bf16 kernel plus hqq to provide better performance.
| Model | Quantization | Tokens/sec | Throughput (GB/sec) | Peak Mem (GB) | Model Size (GB) |
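For readers who want a sense of how the perplexity column is produced, below is a minimal, generic sketch of perplexity evaluation with Hugging Face Transformers. It is not the actual awq/example.py logic; the sequence length, evaluation texts, and loading code are illustrative assumptions, and the group-size-64 quantization step is omitted (a model quantized via torchao would be evaluated the same way).

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL = "meta-llama/Llama-2-7b-chat-hf"  # model from the benchmark description above
SEQ_LEN = 2048                           # assumption: evaluation chunk length

@torch.no_grad()
def perplexity(model, tokenizer, texts, seq_len=SEQ_LEN, device="cuda"):
    """Perplexity = exp(mean per-token negative log-likelihood) over the evaluation chunks."""
    total_nll, total_tokens = 0.0, 0
    for text in texts:
        ids = tokenizer(text, return_tensors="pt").input_ids[:, :seq_len].to(device)
        out = model(ids, labels=ids)      # loss is the mean cross-entropy over the shifted targets
        n = ids.shape[1] - 1              # number of predicted tokens in this chunk
        total_nll += out.loss.item() * n  # undo the mean so chunks of different lengths weight correctly
        total_tokens += n
    return float(torch.exp(torch.tensor(total_nll / total_tokens)))

if __name__ == "__main__":
    tok = AutoTokenizer.from_pretrained(MODEL)
    model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.bfloat16).to("cuda").eval()
    print(perplexity(model, tok, ["Some held-out evaluation text ..."]))
```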