From 575d12f6db6b7efe731212ecf96a6eeda2c9e0e7 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Fri, 16 May 2025 22:10:05 +0000 Subject: [PATCH 1/5] p1 Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/awq/base.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py index 4201911e96..f9593fe304 100644 --- a/src/llmcompressor/modifiers/awq/base.py +++ b/src/llmcompressor/modifiers/awq/base.py @@ -466,33 +466,41 @@ def _apply_smoothing(self, model: Module) -> None: inp, w_mean, x_mean, module2inspect, balance_layers, fp16_output ) - scales = best_scales - @torch.no_grad() def smooth(module): with align_module_device(module): + scales = best_scales.to(module.weight.device) if module in balance_layers: - module.weight.mul_(scales.view(1, -1).to(module.weight.device)) + update_offload_parameter( + module, + "weight", + module.weight.mul_(scales.view(1, -1)), + ) elif module == smooth_layer: if module.weight.ndim == 1: update_offload_parameter( module, "weight", - module.weight.div(scales.to(module.weight.device)), + module.weight.div(scales), ) else: + # https://github.com/casper-hansen/AutoAWQ/blob/main/awq/quantize/scale.py#L123 + # TODO fc1.weight[-scales.size(0) :].div_(scales.view(-1, 1)) + weight = module.weight + weight[-scales.size(0) :].div_(scales.view(-1, 1)) update_offload_parameter( module, "weight", - module.weight.div( - scales.view(-1, 1).to(module.weight.device) - ), + weight, + # module.weight.div( + # scales.view(-1, 1).to(module.weight.device) + # ), ) if hasattr(module, "bias") and module.bias is not None: update_offload_parameter( module, "bias", - module.bias.div(scales.to(module.bias.device)), + module.bias.div(scales), ) parent = get_fsdp_parent(mapping.smooth_name, model) From 35a03993a67361a33736a96e1ca6d2e13915a332 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Fri, 16 May 2025 22:35:46 +0000 Subject: [PATCH 2/5] more to do Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/awq/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py index f9593fe304..f6705c7dca 100644 --- a/src/llmcompressor/modifiers/awq/base.py +++ b/src/llmcompressor/modifiers/awq/base.py @@ -488,6 +488,7 @@ def smooth(module): # TODO fc1.weight[-scales.size(0) :].div_(scales.view(-1, 1)) weight = module.weight weight[-scales.size(0) :].div_(scales.view(-1, 1)) + # weight.transpose(1, 0).div_(scales.view(-1, 1)) update_offload_parameter( module, "weight", From 58a715d621f9e94fc3616a67afddf725c46e3636 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 19 May 2025 20:25:15 +0000 Subject: [PATCH 3/5] only scale subset with valid shape Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/awq/base.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py index f6705c7dca..efc5488f6a 100644 --- a/src/llmcompressor/modifiers/awq/base.py +++ b/src/llmcompressor/modifiers/awq/base.py @@ -481,27 +481,30 @@ def smooth(module): update_offload_parameter( module, "weight", - module.weight.div(scales), + module.weight.div_(scales), ) else: + # NOTE: edge case when smooth layer number of out_features + # is not equal to balance layer number of in_features + # e.g. when fused qkv_proj is used to smooth o_proj + # in this case, default to scaling the last output features + # because the desired smooth layer is v_proj # https://github.com/casper-hansen/AutoAWQ/blob/main/awq/quantize/scale.py#L123 - # TODO fc1.weight[-scales.size(0) :].div_(scales.view(-1, 1)) weight = module.weight - weight[-scales.size(0) :].div_(scales.view(-1, 1)) - # weight.transpose(1, 0).div_(scales.view(-1, 1)) + if module.out_features == scales.numel(): + weight.div_(scales.view(-1, 1)) + else: + weight[-scales.size(0) :].div_(scales.view(-1, 1)) update_offload_parameter( module, "weight", weight, - # module.weight.div( - # scales.view(-1, 1).to(module.weight.device) - # ), ) if hasattr(module, "bias") and module.bias is not None: update_offload_parameter( module, "bias", - module.bias.div(scales), + module.bias.div_(scales), ) parent = get_fsdp_parent(mapping.smooth_name, model) From 876bed0bf8679ff6ec4b4b8ae9241eb756446450 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Tue, 20 May 2025 18:44:32 +0000 Subject: [PATCH 4/5] update to skip qkv_proj->o_proj mappings on shape condition Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/awq/base.py | 28 ++++++++++++++++--------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py index efc5488f6a..612670d472 100644 --- a/src/llmcompressor/modifiers/awq/base.py +++ b/src/llmcompressor/modifiers/awq/base.py @@ -311,13 +311,24 @@ def _set_resolved_mappings(self, model: Module) -> None: continue # exclude v_proj/o_proj mappings whose shapes are incompatible + # exclude fused qkv_proj/o_proj mappings when shapes are not all equal # https://github.com/mit-han-lab/llm-awq/pull/67#issuecomment-1681632777 if ( - ".v_proj" in layer_name - and ".o_proj" in balance_name - and isinstance(smooth_layer, torch.nn.Linear) + isinstance(smooth_layer, torch.nn.Linear) and isinstance(balance_layer, torch.nn.Linear) - and smooth_layer.weight.shape != balance_layer.weight.shape + and ".o_proj" in balance_name + and ( + ( + ".v_proj" in layer_name + and smooth_layer.out_features + != balance_layer.in_features + ) + or ( + ".qkv_proj" in layer_name + and smooth_layer.out_features + != 3 * balance_layer.in_features + ) + ) ): num_skipped_oproj_mappings += 1 continue @@ -490,15 +501,12 @@ def smooth(module): # in this case, default to scaling the last output features # because the desired smooth layer is v_proj # https://github.com/casper-hansen/AutoAWQ/blob/main/awq/quantize/scale.py#L123 - weight = module.weight - if module.out_features == scales.numel(): - weight.div_(scales.view(-1, 1)) - else: - weight[-scales.size(0) :].div_(scales.view(-1, 1)) update_offload_parameter( module, "weight", - weight, + module.weight[-scales.size(0) :].div_( + scales.view(-1, 1) + ), ) if hasattr(module, "bias") and module.bias is not None: update_offload_parameter( From 5f06c067e5277b97f710246bb9a962b0250d19b3 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Tue, 20 May 2025 18:59:13 +0000 Subject: [PATCH 5/5] style fixes Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/awq/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py index 612670d472..49fc491fc5 100644 --- a/src/llmcompressor/modifiers/awq/base.py +++ b/src/llmcompressor/modifiers/awq/base.py @@ -310,8 +310,7 @@ def _set_resolved_mappings(self, model: Module) -> None: if not balance_layer: continue - # exclude v_proj/o_proj mappings whose shapes are incompatible - # exclude fused qkv_proj/o_proj mappings when shapes are not all equal + # exclude v_proj->o_proj mappings whose shapes are incompatible # https://github.com/mit-han-lab/llm-awq/pull/67#issuecomment-1681632777 if ( isinstance(smooth_layer, torch.nn.Linear)