From 575d12f6db6b7efe731212ecf96a6eeda2c9e0e7 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta <bdellabe@redhat.com>
Date: Fri, 16 May 2025 22:10:05 +0000
Subject: [PATCH 1/5] awq: use update_offload_parameter in smooth()

Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
---
 src/llmcompressor/modifiers/awq/base.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py
index 4201911e96..f9593fe304 100644
--- a/src/llmcompressor/modifiers/awq/base.py
+++ b/src/llmcompressor/modifiers/awq/base.py
@@ -466,33 +466,41 @@ def _apply_smoothing(self, model: Module) -> None:
                     inp, w_mean, x_mean, module2inspect, balance_layers, fp16_output
                 )
 
-            scales = best_scales
-
             @torch.no_grad()
             def smooth(module):
                 with align_module_device(module):
+                    scales = best_scales.to(module.weight.device)
                     if module in balance_layers:
-                        module.weight.mul_(scales.view(1, -1).to(module.weight.device))
+                        update_offload_parameter(
+                            module,
+                            "weight",
+                            module.weight.mul_(scales.view(1, -1)),
+                        )
                     elif module == smooth_layer:
                         if module.weight.ndim == 1:
                             update_offload_parameter(
                                 module,
                                 "weight",
-                                module.weight.div(scales.to(module.weight.device)),
+                                module.weight.div(scales),
                             )
                         else:
+                            # https://github.com/casper-hansen/AutoAWQ/blob/main/awq/quantize/scale.py#L123
+                            # TODO fc1.weight[-scales.size(0) :].div_(scales.view(-1, 1))
+                            weight = module.weight
+                            weight[-scales.size(0) :].div_(scales.view(-1, 1))
                             update_offload_parameter(
                                 module,
                                 "weight",
-                                module.weight.div(
-                                    scales.view(-1, 1).to(module.weight.device)
-                                ),
+                                weight,
+                                # module.weight.div(
+                                #     scales.view(-1, 1).to(module.weight.device)
+                                # ),
                             )
                         if hasattr(module, "bias") and module.bias is not None:
                             update_offload_parameter(
                                 module,
                                 "bias",
-                                module.bias.div(scales.to(module.bias.device)),
+                                module.bias.div(scales),
                             )
 
             parent = get_fsdp_parent(mapping.smooth_name, model)

From 35a03993a67361a33736a96e1ca6d2e13915a332 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta <bdellabe@redhat.com>
Date: Fri, 16 May 2025 22:35:46 +0000
Subject: [PATCH 2/5] awq: note transpose-based scaling alternative (WIP)

Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
---
 src/llmcompressor/modifiers/awq/base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py
index f9593fe304..f6705c7dca 100644
--- a/src/llmcompressor/modifiers/awq/base.py
+++ b/src/llmcompressor/modifiers/awq/base.py
@@ -488,6 +488,7 @@ def smooth(module):
                             # TODO fc1.weight[-scales.size(0) :].div_(scales.view(-1, 1))
                             weight = module.weight
                             weight[-scales.size(0) :].div_(scales.view(-1, 1))
+                            # weight.transpose(1, 0).div_(scales.view(-1, 1))
                             update_offload_parameter(
                                 module,
                                 "weight",

From 58a715d621f9e94fc3616a67afddf725c46e3636 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta <bdellabe@redhat.com>
Date: Mon, 19 May 2025 20:25:15 +0000
Subject: [PATCH 3/5] only scale subset with valid shape

Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
---
 src/llmcompressor/modifiers/awq/base.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py
index f6705c7dca..efc5488f6a 100644
--- a/src/llmcompressor/modifiers/awq/base.py
+++ b/src/llmcompressor/modifiers/awq/base.py
@@ -481,27 +481,30 @@ def smooth(module):
                             update_offload_parameter(
                                 module,
                                 "weight",
-                                module.weight.div(scales),
+                                module.weight.div_(scales),
                             )
                         else:
+                            # NOTE: edge case when smooth layer number of out_features
+                            # is not equal to balance layer number of in_features
+                            # e.g. when fused qkv_proj is used to smooth o_proj
+                            # in this case, default to scaling the last output features
+                            # because the desired smooth layer is v_proj
                             # https://github.com/casper-hansen/AutoAWQ/blob/main/awq/quantize/scale.py#L123
-                            # TODO fc1.weight[-scales.size(0) :].div_(scales.view(-1, 1))
                             weight = module.weight
-                            weight[-scales.size(0) :].div_(scales.view(-1, 1))
-                            # weight.transpose(1, 0).div_(scales.view(-1, 1))
+                            if module.out_features == scales.numel():
+                                weight.div_(scales.view(-1, 1))
+                            else:
+                                weight[-scales.size(0) :].div_(scales.view(-1, 1))
                             update_offload_parameter(
                                 module,
                                 "weight",
                                 weight,
-                                # module.weight.div(
-                                #     scales.view(-1, 1).to(module.weight.device)
-                                # ),
                             )
                         if hasattr(module, "bias") and module.bias is not None:
                             update_offload_parameter(
                                 module,
                                 "bias",
-                                module.bias.div(scales),
+                                module.bias.div_(scales),
                             )
 
             parent = get_fsdp_parent(mapping.smooth_name, model)

From 876bed0bf8679ff6ec4b4b8ae9241eb756446450 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta <bdellabe@redhat.com>
Date: Tue, 20 May 2025 18:44:32 +0000
Subject: [PATCH 4/5] update to skip qkv_proj->o_proj mappings on shape
 condition

Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
---
 src/llmcompressor/modifiers/awq/base.py | 28 ++++++++++++++++---------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py
index efc5488f6a..612670d472 100644
--- a/src/llmcompressor/modifiers/awq/base.py
+++ b/src/llmcompressor/modifiers/awq/base.py
@@ -311,13 +311,24 @@ def _set_resolved_mappings(self, model: Module) -> None:
                             continue
 
                         # exclude v_proj/o_proj mappings whose shapes are incompatible
+                        # exclude fused qkv_proj/o_proj mappings when shapes are not all equal
                         # https://github.com/mit-han-lab/llm-awq/pull/67#issuecomment-1681632777
                         if (
-                            ".v_proj" in layer_name
-                            and ".o_proj" in balance_name
-                            and isinstance(smooth_layer, torch.nn.Linear)
+                            isinstance(smooth_layer, torch.nn.Linear)
                             and isinstance(balance_layer, torch.nn.Linear)
-                            and smooth_layer.weight.shape != balance_layer.weight.shape
+                            and ".o_proj" in balance_name
+                            and (
+                                (
+                                    ".v_proj" in layer_name
+                                    and smooth_layer.out_features
+                                    != balance_layer.in_features
+                                )
+                                or (
+                                    ".qkv_proj" in layer_name
+                                    and smooth_layer.out_features
+                                    != 3 * balance_layer.in_features
+                                )
+                            )
                         ):
                             num_skipped_oproj_mappings += 1
                             continue
@@ -490,15 +501,12 @@ def smooth(module):
                             # in this case, default to scaling the last output features
                             # because the desired smooth layer is v_proj
                             # https://github.com/casper-hansen/AutoAWQ/blob/main/awq/quantize/scale.py#L123
-                            weight = module.weight
-                            if module.out_features == scales.numel():
-                                weight.div_(scales.view(-1, 1))
-                            else:
-                                weight[-scales.size(0) :].div_(scales.view(-1, 1))
                             update_offload_parameter(
                                 module,
                                 "weight",
-                                weight,
+                                module.weight[-scales.size(0) :].div_(
+                                    scales.view(-1, 1)
+                                ),
                             )
                         if hasattr(module, "bias") and module.bias is not None:
                             update_offload_parameter(

From 5f06c067e5277b97f710246bb9a962b0250d19b3 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta <bdellabe@redhat.com>
Date: Tue, 20 May 2025 18:59:13 +0000
Subject: [PATCH 5/5] style fixes

Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
---
 src/llmcompressor/modifiers/awq/base.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py
index 612670d472..49fc491fc5 100644
--- a/src/llmcompressor/modifiers/awq/base.py
+++ b/src/llmcompressor/modifiers/awq/base.py
@@ -310,8 +310,7 @@ def _set_resolved_mappings(self, model: Module) -> None:
                         if not balance_layer:
                             continue
 
-                        # exclude v_proj/o_proj mappings whose shapes are incompatible
-                        # exclude fused qkv_proj/o_proj mappings when shapes are not all equal
+                        # exclude v_proj->o_proj mappings whose shapes are incompatible
                         # https://github.com/mit-han-lab/llm-awq/pull/67#issuecomment-1681632777
                         if (
                             isinstance(smooth_layer, torch.nn.Linear)