pytorch · msaroufim · Apr 25, 2024 · Apr 22, 2024 · Apr 22, 2024 · Apr 22, 2024
diff --git a/benchmarks/benchmark_hqq.py b/benchmarks/benchmark_hqq.py
@@ -0,0 +1,134 @@
+import torch
+from termcolor import colored
+
+import pandas as pd
+from hqq.core.quantize import HQQLinear, BaseQuantizeConfig
+from torchao.prototype.hqq.hqq_tinygemm_linear import HQQLinearTorchWeightOnlyInt4
+from torchao.prototype.hqq import triton_mixed_mm, pack_2xint4
+
+from triton.testing import do_bench
+
+
+# Weight-quantization parameters shared by every benchmark run; `run_benchmark`
+# merges these with the per-run `group_size` and `axis` before quantizing.
+BASE_QUANT_CONFIG = {
+    "optimize": True,
+    "view_as_float": False,
+    "nbits": 4,
+    # `run_benchmark` reshapes `W_q` to `meta["shape"]` only when this is False.
+    "bitpack": False,
+    "axis": 1,
+}
+
+
+def bench_custom_kernel(x, W_q, scales, zeros, group_size, kernel_type="max_autotune", fp8_fast_accum=False):
+    packed_w = pack_2xint4(W_q.T)
+
+    def fn():
+        _ = triton_mixed_mm(
+            x,
+            packed_w,
+            scales.T,
+            zeros.T,
+            group_size=group_size,
+            fp8_fast_accum=fp8_fast_accum,
+            kernel_type=kernel_type,
+        )
+
+    t = do_bench(fn)
+    return t
+
+
+def bench_hqq(x, hqq_linear: HQQLinear):
+    def fn():
+        _ = hqq_linear.forward(x)
+
+    t = do_bench(fn)
+    return t
+
+
+def run_benchmark(shape, group_size, dtype, axis=1, quant_dtype=torch.uint8):
+    qcfg = {
+        **BASE_QUANT_CONFIG,
+        **dict(group_size=group_size, axis=axis),
+    }
+    M, N, K = shape
+
+    x = torch.randn(M, K, dtype=dtype, device="cuda")
+    linear = torch.nn.Linear(K, N, bias=False, dtype=dtype, device="cuda")
+
+    quant_config = BaseQuantizeConfig(
+        quant_zero=False, quant_scale=False, offload_meta=False, view_as_float=False
+    )
+    quant_config.update({"weight_quant_params": qcfg})
+
+    hqq_linear = HQQLinear(linear, quant_config, compute_dtype=dtype, del_orig=False)
+
+    # Reference
+    ref_time = bench_hqq(x, hqq_linear)
+
+    # Custom kernel
+    W_q, meta = hqq_linear.W_q, hqq_linear.meta
+    scales, zeros = meta["scale"], meta["zero"]
+
+    W_q = (
+        W_q.reshape(meta["shape"])
+        if quant_config["weight_quant_params"]["bitpack"] == False
+        else W_q
+    )
+    W_q = W_q.to(dtype=quant_dtype)
+    scales = scales.reshape(N, -1)
+    zeros = zeros.reshape(N, -1)
+    tt_time = bench_custom_kernel(x, W_q, scales, zeros, group_size)
+
+    if dtype == torch.bfloat16:
+        _ = quant_config["weight_quant_params"].pop("bitpack")
+        hqq_int4mm = HQQLinearTorchWeightOnlyInt4(
+            linear, quant_config, compute_dtype=dtype, del_orig=False
+        )
+        int4_time = bench_hqq(x, hqq_int4mm)
+
+    print(colored(f"{shape=} {group_size=} {dtype=}:", attrs=["bold"]))
+
+    print(
+        colored(f"Ref: {ref_time:.4f}", "blue"),
+        colored(f"Triton: {tt_time:.4f}", "green"),
+        colored(f"Torch int4mm: {int4_time:.4f}", "yellow")
+        if dtype == torch.bfloat16
+        else "",
+    )
+    print()
+    return ref_time, tt_time, int4_time if dtype == torch.bfloat16 else None
+
+
+# Problem sizes to benchmark; `run_benchmark` unpacks each entry as (M, N, K).
+SHAPES = [
+    [16, 4096, 4096],
+    [32, 4096, 4096],
+    [128, 4096, 4096],
+    [256, 4096, 4096],
+    [512, 4096, 4096],
+    [1024, 4096, 4096],
+]
+
+# Activation / compute dtypes to sweep (float16 is currently disabled).
+DTYPES = [torch.bfloat16]  # , torch.float16]
+# Quantization group sizes to sweep.
+GROUP_SIZES = [128]
+
+print(torch.cuda.get_device_properties(0))
+
+HEADERS = [
+    "M",
+    "N",
+    "K",
+    "group_size",
+    "dtype",
+    "ref",
+    "triton",
+    "tinygemm",
+]
+data = []
+for shape in SHAPES:
+    for group_size in GROUP_SIZES:
+        for dtype in DTYPES:
+            timings = run_benchmark(shape, group_size, dtype)
+            data.append((*shape, group_size, dtype, *timings))
+
+
+df = pd.DataFrame(data, columns=HEADERS)
+df.to_csv("benchmark_triton.csv", index=False)
diff --git a/test/hqq/test_triton_mm.py b/test/hqq/test_triton_mm.py
@@ -0,0 +1,101 @@
+import itertools
+
+import torch
+from termcolor import colored
+
+from hqq.core.quantize import HQQLinear, BaseQuantizeConfig
+from hqq.kernels.custom_quant.triton import triton_mixed_mm, pack_2xint4
+from torchao.prototype.hqq import triton_mixed_mm, pack_2xint4
+from torchao.prototype.hqq.hqq_tinygemm_linear import HQQLinearTorchWeightOnlyInt4
+
+
+#TODO: refactor to pytest
+
+#Test configs
+# Each active shape is unpacked as (M, N, K) by `test_mixed_mm`.
+# NOTE(review): the commented-out entries have only two elements and would not
+# unpack as (M, N, K) if re-enabled as written.
+SHAPES = [
+    # [16, 128],
+    [16, 128, 128],
+    [16, 4096, 4096],
+    # [1024, 4096],
+    # [4096, 4096],
+    # [4096, 11008],
+]
+
+DTYPES = [torch.bfloat16, torch.float16]
+GROUP_SIZES = [64, 128]
+AXES = [1] #Only axis = 1 supported
+TRITON_KERNEL_TYPE = ["compute_bound"] #["max_autotune", "compute_bound"]
+# Cartesian product of the lists above; each tuple matches the positional
+# parameters of `test_mixed_mm` (shape, group_size, axis, dtype, kernel_type).
+TEST_CONFIGS = list(itertools.product(SHAPES, GROUP_SIZES, AXES, DTYPES, TRITON_KERNEL_TYPE))
+
+# Weight-quantization parameters shared by every test; `test_mixed_mm` merges
+# these with the per-test `group_size` and `axis`.
+BASE_QUANT_CONFIG = {
+    "optimize": True,
+    "view_as_float": False,
+    "nbits": 4,
+    # "quant_dtype": torch.uint8,
+    "bitpack": False,
+    "axis": 1,
+}
+
+
+def check(expected, actual, cfg_str, max_diff=1e-3):
+    passed = torch.allclose(expected, actual, atol=max_diff, rtol=max_diff)
+    max_err = (expected - actual).abs().max()
+    if not passed:
+        print(colored(f"{cfg_str}: Failed! Max error: {max_err}", "red", attrs=["bold"]))
+    else:
+        print(colored(f"{cfg_str}: Passed! Max error: {max_err}", "green", attrs=["bold"]))
+
+def test_mixed_mm(shape, group_size, axis, dtype, kernel_type, quant_dtype=torch.uint8):
+    # print(f"Test: {shape}, {group_size}, {axis}, {dtype}")
+    qcfg = {
+        **BASE_QUANT_CONFIG,
+        **dict(group_size=group_size, axis=axis),
+    }
+    M, N, K = shape
+
+    x = torch.randn(M, K, dtype=dtype, device="cuda")
+    linear = torch.nn.Linear(K, N, bias=False, dtype=dtype, device="cuda")
+
+    quant_config = BaseQuantizeConfig(
+        quant_zero=False, quant_scale=False, offload_meta=False, view_as_float=False
+    )
+    quant_config.update({"weight_quant_params": qcfg})
+    hqq_linear = HQQLinear(linear, quant_config, compute_dtype=dtype, del_orig=False)
+    W_q, meta = hqq_linear.W_q, hqq_linear.meta
+    W_q = (
+        W_q.reshape(meta["shape"])
+        if quant_config["weight_quant_params"]["bitpack"] == False
+        else W_q
+    )
+    scales, zeros = meta["scale"], meta["zero"]
+
+    #Reference
+    hqq_out = hqq_linear.forward(x)
+
+    ##Triton
+    W_q = W_q.to(dtype=quant_dtype)
+    packed_w = pack_2xint4(W_q.T)
+    scales = scales.reshape(N, -1)
+    zeros = zeros.reshape(N, -1)
+    tt_out = triton_mixed_mm(
+        x, packed_w, scales.T, zeros.T, group_size=group_size, fp8_fast_accum=False, kernel_type=kernel_type
+    )
+
+    cfg_str = f"Test config {shape} {group_size} {dtype}"
+    # err = (hqq_out - tt_out).abs().max()
+    check(hqq_out, tt_out, cfg_str + " triton", max_diff=1e-2 if dtype == torch.bfloat16 else 1e-3)
+
+    if dtype == torch.bfloat16:
+        _ = quant_config["weight_quant_params"].pop("bitpack")
+        hqq_int4mm = HQQLinearTorchWeightOnlyInt4(
+            linear, quant_config, compute_dtype=dtype, del_orig=False
+        )
+        hqq_int4_out = hqq_int4mm.forward(x)
+        err = (hqq_int4_out - hqq_out).abs().max()
+        check(hqq_out, hqq_int4_out, cfg_str + " torch_tinygemm", max_diff=1e-2)
+
+    print()
+
+
+for test in TEST_CONFIGS:
+    test_mixed_mm(*test)
diff --git a/torchao/prototype/hqq/README.md b/torchao/prototype/hqq/README.md
@@ -0,0 +1,43 @@
+## Fused `int4 / fp16` Quant Matmul
+
+Fused gemm for asymmetric quantized weights. Tested and benchmarked for `HQQ` but could theoretically be used for any asymmetric quantization scheme.
+
+The kernel packs `u4 / s4` weights and fuses dequantization with the matmul.
+
+- tested for `float16 / bfloat16` activations, scales, and zeros
+- autotuned for both compute-bound and io-bound configs
+- assumes operand B of the `gemm` is the quantized type.
+- requires quantization along `in-features`, i.e., the `K` dimension, or `axis=1`, of `torch.nn.Linear.weight`.
+
+### Performance
+
+Initial benchmarking demonstrates promising results, scaling well across io-bound and compute-bound workloads:
+
+|     | M    | N    | K    | group_size | dtype          | hqq_ref | triton | tinygemm |
+| --- | ---- | ---- | ---- | ---------- | -------------- | ------- | ------ | -------- |
+| 0   | 16   | 4096 | 4096 | 128        | torch.bfloat16 | 0.2675  | 0.0633 | 0.0382   |
+| 1   | 32   | 4096 | 4096 | 128        | torch.bfloat16 | 0.2669  | 0.0704 | 0.0649   |
+| 2   | 128  | 4096 | 4096 | 128        | torch.bfloat16 | 0.2689  | 0.0960 | 0.2523   |
+| 3   | 256  | 4096 | 4096 | 128        | torch.bfloat16 | 0.3268  | 0.1355 | 0.5192   |
+| 4   | 512  | 4096 | 4096 | 128        | torch.bfloat16 | 0.3628  | 0.2369 | 1.0892   |
+| 5   | 1024 | 4096 | 4096 | 128        | torch.bfloat16 | 0.5133  | 0.4753 | 2.2016   |
+
+- Times are in `ms`, see `benchmarks/benchmark_hqq.py`.
+- `hqq_ref` is the base `HQQLinear` [module](https://github.com/mobiusml/hqq/blob/6d50eee4bcdd99cc10716f1297c5b2803d2b6da4/hqq/core/quantize.py#L349), which is unfused (dequantization followed by a call to `torch.matmul`).
+- `tinygemm` calls `torch.ops.aten._weight_int4pack_mm`. Implementation is a custom HQQLinear layer that wraps the preprocessing necessary for this kernel, adapted from a benchmark script posted by @mobicham from `CUDA-mode` Discord discussions.
+
+GPU details:
+
+```
+_CudaDeviceProperties(name='NVIDIA RTX A6000', major=8, minor=6, total_memory=48676MB, multi_processor_count=84)
+```
+
+### NOTE
+
+This implementation requires **`triton >= 3.0.0`**.
+
+- Running tests / benchmarks requires installation of `hqq`:
+
+  ```
+  pip install hqq
+  ```
diff --git a/torchao/prototype/hqq/__init__.py b/torchao/prototype/hqq/__init__.py
@@ -0,0 +1 @@
+from .mixed_mm import triton_mixed_mm, pack_2xint4
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		from .mixed_mm import triton_mixed_mm, pack_2xint4