From 4ec91c1ccb0816a2145d7b6bda34d34fef02d38d Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Thu, 29 May 2025 16:10:07 +0000 Subject: [PATCH 01/13] squashed/rebased Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/awq/base.py | 149 +++++++++++--------- src/llmcompressor/modifiers/awq/mappings.py | 1 + 2 files changed, 83 insertions(+), 67 deletions(-) diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py index d7decaf374..afb9097792 100644 --- a/src/llmcompressor/modifiers/awq/base.py +++ b/src/llmcompressor/modifiers/awq/base.py @@ -307,77 +307,92 @@ def _set_resolved_mappings(self, model: Module) -> None: repeat for model.layer.1 and so on """ resolved_mappings: list[ResolvedMapping] = [] - num_skipped_oproj_mappings = 0 - for mapping in self.mappings: - to_smooth_layers = get_layers(mapping.smooth_layer, model) - for layer_name, smooth_layer in to_smooth_layers.items(): - # always exclude `.weight_observer`, only want `.weight` - if layer_name not in self.ignore and not layer_name.endswith( - "_observer" - ): - balance_layers, balance_names = [], [] - for balance_suffix in mapping.balance_layers: - # find the submodule that matches the activation layer - balance_name, balance_layer = get_matching_layer( - balance_suffix, layer_name, model - ) - if not balance_layer: - continue - - # exclude v_proj->o_proj mappings whose shapes are incompatible - # https://github.com/mit-han-lab/llm-awq/pull/67#issuecomment-1681632777 - if ( - isinstance(smooth_layer, torch.nn.Linear) - and isinstance(balance_layer, torch.nn.Linear) - and ".o_proj" in balance_name - and ( - ( - ".v_proj" in layer_name - and smooth_layer.out_features - != balance_layer.in_features - ) - or ( - ".qkv_proj" in layer_name - and smooth_layer.out_features - != 3 * balance_layer.in_features - ) - ) - ): - num_skipped_oproj_mappings += 1 - continue - - balance_layers.append(balance_layer) - balance_names.append(balance_name) + for mapping_idx, mapping in enumerate(self.mappings): + smooth_layers = get_layers(mapping.smooth_layer, model) + smooth_names = [ + smooth_name + for smooth_name in smooth_layers + if ( + smooth_name not in self.ignore + and not smooth_name.endswith("_observer") + ) + ] + + num_skipped_mappings = 0 + pbar = tqdm(smooth_names) + for smooth_name in pbar: + pbar.set_description( + f"Resolving mapping {mapping_idx+1}/{len(self.mappings)}" + f" ({num_skipped_mappings} skipped)" + ) + smooth_layer = smooth_layers[smooth_name] - if len(balance_layers) == 0: + balance_layers, balance_names = [], [] + for balance_suffix in mapping.balance_layers: + # find the submodule that matches the activation layer + parent_name, parent_module = get_parent_by_name( + layer_name=smooth_name, model=model + ) + balance_name, balance_layer = get_matching_layer( + balance_suffix, + smooth_name, + parent_module, + ) + if not balance_layer: continue - - # each mapping can contain multiple layers to balance, but only - # one layer to smooth - if len(balance_layers) == 1: - # for single balance layer, parent is the balance layer - parent_name, parent = balance_name, balance_layer - else: - # for multiple balance layers, - # parent of any balance layer is the parent - parent_name, parent = get_parent_by_name( - layer_name=balance_name, model=model - ) - resolved_mappings.append( - ResolvedMapping( - layer_name, - smooth_layer, - balance_layers, - balance_names=balance_names, - parent=parent, - parent_name=parent_name, + balance_name = f"{parent_name}.{balance_name}" + + # 
exclude v_proj->o_proj mappings whose shapes are incompatible + # https://github.com/mit-han-lab/llm-awq/pull/67#issuecomment-1681632777 + if ( + ".v_proj" in smooth_name + and ".o_proj" in balance_name + and isinstance(smooth_layer, torch.nn.Linear) + and isinstance(balance_layer, torch.nn.Linear) + and ".o_proj" in balance_name + and ( + ( + ".v_proj" in smooth_name + and smooth_layer.out_features + != balance_layer.in_features + ) + or ( + ".qkv_proj" in smooth_name + and smooth_layer.out_features + != 3 * balance_layer.in_features + ) ) + ): + num_skipped_mappings += 1 + continue + + balance_layers.append(balance_layer) + balance_names.append(balance_name) + + if len(balance_layers) == 0: + continue + + # each mapping can contain multiple layers to balance, but only + # one layer to smooth + elif len(balance_layers) == 1: + # for single balance layer, parent is the balance layer + parent_name, parent = balance_name, balance_layer + else: + # for multiple balance layers, + # parent of any balance layer is the parent + parent_name, parent = get_parent_by_name( + layer_name=balance_name, model=model ) - if num_skipped_oproj_mappings > 0: - logger.info( - f"Excluded {num_skipped_oproj_mappings} from resolved " - "mappings due to shape mismatch" - ) + resolved_mappings.append( + ResolvedMapping( + smooth_name, + smooth_layer, + balance_layers, + balance_names=balance_names, + parent=parent, + parent_name=parent_name, + ) + ) self._resolved_mappings = resolved_mappings return diff --git a/src/llmcompressor/modifiers/awq/mappings.py b/src/llmcompressor/modifiers/awq/mappings.py index 700525ed8b..a981c7e44c 100644 --- a/src/llmcompressor/modifiers/awq/mappings.py +++ b/src/llmcompressor/modifiers/awq/mappings.py @@ -62,6 +62,7 @@ class AWQMapping: "LlamaForCausalLM": _default_mappings, "Qwen2ForCausalLM": _default_mappings, "Qwen3ForCausalLM": _default_mappings, + "Qwen3MoeForCausalLM": _default_mappings, "MistralForCausalLM": _default_mappings, "Phi3ForCausalLM": _phi_mappings, "Phi3VForCausalLM": _phi_mappings, From 8dc118ff9f39a4f7a43891852a85e75980914430 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Thu, 29 May 2025 17:11:40 +0000 Subject: [PATCH 02/13] test fixes Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/awq/mappings.py | 28 ++++++------- .../llmcompressor/modifiers/awq/test_base.py | 40 +++++++++++-------- 2 files changed, 38 insertions(+), 30 deletions(-) diff --git a/src/llmcompressor/modifiers/awq/mappings.py b/src/llmcompressor/modifiers/awq/mappings.py index a981c7e44c..3f2e16818f 100644 --- a/src/llmcompressor/modifiers/awq/mappings.py +++ b/src/llmcompressor/modifiers/awq/mappings.py @@ -25,17 +25,17 @@ class AWQMapping: _default_mappings = [ AWQMapping( - "re:.*input_layernorm", - ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], + "re:.*input_layernorm$", + ["re:.*q_proj$", "re:.*k_proj$", "re:.*v_proj$"], ), - AWQMapping("re:.*v_proj", ["re:.*o_proj"]), + AWQMapping("re:.*v_proj$", ["re:.*o_proj$"]), AWQMapping( - "re:.*post_attention_layernorm", - ["re:.*gate_proj", "re:.*up_proj"], + "re:.*post_attention_layernorm$", + ["re:.*gate_proj$", "re:.*up_proj$"], ), AWQMapping( - "re:.*up_proj", - ["re:.*down_proj"], + "re:.*up_proj$", + ["re:.*down_proj$"], ), ] @@ -44,17 +44,17 @@ class AWQMapping: # gate and up proj layers into a single gate_up_proj layer _phi_mappings = [ AWQMapping( - "re:.*input_layernorm", - ["re:.*qkv_proj"], + "re:.*input_layernorm$", + ["re:.*qkv_proj$"], ), - AWQMapping("re:.*qkv_proj", ["re:.*o_proj"]), + 
AWQMapping("re:.*qkv_proj$", ["re:.*o_proj$"]), AWQMapping( - "re:.*post_attention_layernorm", - ["re:.*gate_up_proj"], + "re:.*post_attention_layernorm$", + ["re:.*gate_up_proj$"], ), AWQMapping( - "re:.*gate_up_proj", - ["re:.*down_proj"], + "re:.*gate_up_proj$", + ["re:.*down_proj$"], ), ] diff --git a/tests/llmcompressor/modifiers/awq/test_base.py b/tests/llmcompressor/modifiers/awq/test_base.py index ae38da08ca..035519b5aa 100644 --- a/tests/llmcompressor/modifiers/awq/test_base.py +++ b/tests/llmcompressor/modifiers/awq/test_base.py @@ -56,32 +56,36 @@ def test_set_resolved_mappings(): ) model = torch.nn.ModuleDict( { - "self_attn": self_attn, - "input_layernorm": torch.nn.LayerNorm(4), - "mlp": mlp, + "decoder": torch.nn.ModuleDict( + { + "self_attn": self_attn, + "input_layernorm": torch.nn.LayerNorm(4), + "mlp": mlp, + } + ) } ) awq._set_resolved_mappings(model) for mapping in awq._resolved_mappings: if "input_layernorm" in mapping.smooth_name: assert set(mapping.balance_names) == { - "self_attn.q_proj", - "self_attn.k_proj", - "self_attn.v_proj", + "decoder.self_attn.q_proj", + "decoder.self_attn.k_proj", + "decoder.self_attn.v_proj", } assert set(mapping.balance_layers) == { self_attn.q_proj, self_attn.k_proj, self_attn.v_proj, } - assert mapping.parent_name == "self_attn" + assert mapping.parent_name == "decoder.self_attn" assert mapping.parent == self_attn if "self_attn.v_proj" in mapping.smooth_name: - assert set(mapping.balance_names) == {"self_attn.o_proj"} - assert mapping.parent_name == "self_attn.o_proj" + assert set(mapping.balance_names) == {"decoder.self_attn.o_proj"} + assert mapping.parent_name == "decoder.self_attn.o_proj" if "mlp.up_proj" in mapping.smooth_name: - assert set(mapping.balance_names) == {"mlp.down_proj"} - assert mapping.parent_name == "mlp.down_proj" + assert set(mapping.balance_names) == {"decoder.mlp.down_proj"} + assert mapping.parent_name == "decoder.mlp.down_proj" # make sure we exclude case where o_proj/v_proj shapes are mismatched awq = AWQModifier( @@ -92,12 +96,16 @@ def test_set_resolved_mappings(): ) model = torch.nn.ModuleDict( { - "self_attn": torch.nn.ModuleDict( + "decoder": torch.nn.ModuleDict( { - "q_proj": torch.nn.Linear(4, 2), - "k_proj": torch.nn.Linear(4, 2), - "v_proj": torch.nn.Linear(4, 2), - "o_proj": torch.nn.Linear(4, 4), + "self_attn": torch.nn.ModuleDict( + { + "q_proj": torch.nn.Linear(4, 2), + "k_proj": torch.nn.Linear(4, 2), + "v_proj": torch.nn.Linear(4, 2), + "o_proj": torch.nn.Linear(4, 4), + } + ) } ) } From c001953b144ac7275ac38102a02e0b349a1ece19 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Thu, 29 May 2025 23:22:13 +0000 Subject: [PATCH 03/13] fast resolution, still failing on Qwen 3 MoE Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/awq/base.py | 44 +++++++++++++---------- src/llmcompressor/utils/pytorch/module.py | 36 ++++++------------- 2 files changed, 35 insertions(+), 45 deletions(-) diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py index afb9097792..c99febeba7 100644 --- a/src/llmcompressor/modifiers/awq/base.py +++ b/src/llmcompressor/modifiers/awq/base.py @@ -29,7 +29,7 @@ from llmcompressor.utils.pytorch.module import ( get_layers, get_matching_layer, - get_parent_by_name, + get_layer_by_name, ) __all__ = ["AWQModifier"] @@ -330,9 +330,8 @@ def _set_resolved_mappings(self, model: Module) -> None: balance_layers, balance_names = [], [] for balance_suffix in mapping.balance_layers: # find the submodule that matches the activation 
layer
-                    parent_name, parent_module = get_parent_by_name(
-                        layer_name=smooth_name, model=model
-                    )
+                    parent_name = ".".join(smooth_name.split(".")[:-1])
+                    parent_module = get_layer_by_name(parent_name, model)
                     balance_name, balance_layer = get_matching_layer(
                         balance_suffix,
                         smooth_name,
@@ -380,9 +379,9 @@ def _set_resolved_mappings(self, model: Module) -> None:
                     else:
                         # for multiple balance layers,
                         # parent of any balance layer is the parent
-                        parent_name, parent = get_parent_by_name(
-                            layer_name=balance_name, model=model
-                        )
+                        parent_name = ".".join(balance_name.split(".")[:-1])
+                        parent = get_layer_by_name(parent_name, model)
+
                     resolved_mappings.append(
                         ResolvedMapping(
                             smooth_name,
@@ -413,14 +412,15 @@ def cache_parent_kwargs_hook(
     def create_cache_smooth_activations_hook_fn(smooth_name):
         def cache_smooth_activations_hook(
             _module: torch.nn.Module,
-            args: Tuple[torch.Tensor, ...],
+            args: Union[torch.Tensor, Tuple[torch.Tensor, ...]],
             _output: torch.Tensor,
         ):
-            # Assume that first argument is the input
-            inp = args[0].cpu().detach().squeeze()
+            if not isinstance(args, Tuple):
+                print(f"GOT unexpected args {args}")
+            inp = args[0] if isinstance(args, Tuple) else args
 
             self._smooth_activation_means[smooth_name] = _accumulate_mean(
-                inp,
+                inp.cpu().detach().squeeze(),
                 self._smooth_activation_means.get(smooth_name, None),
             )
@@ -459,12 +459,14 @@ def _apply_smoothing(self, model: Module) -> None:
 
         :param model: model to apply smoothing to
         """
-        for mapping in tqdm(self._resolved_mappings, desc="Smoothing"):
-            # NOTE: When using SequentialPipeline, not all the mappings
-            # will have cached activations in the segment being udpated
-            if mapping.smooth_name not in self._smooth_activation_means:
-                continue
-
+        # NOTE: When using SequentialPipeline, not all the mappings
+        # will have cached activations in the segment being updated
+        mappings_to_smooth = [
+            mapping
+            for mapping in self._resolved_mappings
+            if mapping.smooth_name in self._smooth_activation_means
+        ]
+        for mapping in tqdm(mappings_to_smooth, desc="Smoothing"):
             smooth_layer = mapping.smooth_layer
             balance_layers = mapping.balance_layers
             parent_module = mapping.parent
@@ -551,10 +553,14 @@ def smooth(module):
 
     def _run_samples(self, module: Module) -> torch.Tensor:
         with align_module_device(module):
+            outputs = [
+                module(**batch_kwargs)
+                for batch_kwargs in self._parent_args_cache[module]
+            ]
             return torch.cat(
                 [
-                    module(**batch_kwargs)[0]
-                    for batch_kwargs in self._parent_args_cache[module]
+                    output[0] if isinstance(output, Tuple) else output
+                    for output in outputs
                 ],
                 dim=0,
             )
diff --git a/src/llmcompressor/utils/pytorch/module.py b/src/llmcompressor/utils/pytorch/module.py
index 1bb3e3f701..61480478ca 100644
--- a/src/llmcompressor/utils/pytorch/module.py
+++ b/src/llmcompressor/utils/pytorch/module.py
@@ -4,6 +4,7 @@
 
 import difflib
 import re
+from functools import reduce
 from typing import Dict, List, Optional, Tuple, Union
 
 import torch
@@ -53,7 +54,6 @@
     "set_layer",
     "get_params",
     "get_param",
-    "set_param",
    "get_terminal_layers",
     "get_prunable_layers",
     "get_quantizable_layers",
@@ -61,7 +61,7 @@
     "get_layers_params",
     "get_matching_layer",
     "get_no_split_params",
-    "get_parent_by_name",
+    "get_layer_by_name",
 ]
 
@@ -208,15 +208,6 @@ def get_param(target: str, module: Module) -> Tuple[str, Parameter]:
     return name, param
 
 
-def set_param(target: str, param: Parameter, module: Module) -> Parameter:
-    layer_name, param_name = target.rsplit(".", 1)
-    layer = get_layer(layer_name, module)[1]
-    old_param = getattr(layer, param_name)
-    
setattr(layer, param_name, param) - - return old_param - - def get_terminal_layers(module: Module) -> Dict[str, Module]: terminal = {} @@ -344,20 +335,13 @@ def get_no_split_params(model: PreTrainedModel) -> Union[str, List[str]]: return no_split_modules -def get_parent_by_name(layer_name: str, model: Module) -> Tuple[str, Module]: +# https://discuss.pytorch.org/t/how-to-access-to-a-layer-by-module-name/83797/8 +def get_layer_by_name(layer_name: str, module: Module) -> Module: """ - Get the parent layer of a layer by name. - :param layer_name: Name of the layer to find the parent of. - :param model: Model to search for the parent layer. - :return: Tuple containing the name of the parent layer - and the parent layer itself. + Get the layer of a module by name. + :param layer_name: Name of the layer to find. + :param module: Module in which to search for layer_name + :return: Module, the layer with name layer_name """ - if not any(layer_name == name for name, _ in model.named_modules()): - raise ValueError(f"Layer '{layer_name}' not found in model") - - parent_name_parts = layer_name.split(".")[:-1] - if not parent_name_parts: - return "", model - - parent_name = ".".join(parent_name_parts) - return get_layer(parent_name, model) + names = layer_name.split(sep=".") + return reduce(getattr, names, module) From 1eeac0cad1f21768ceba2429d32a1d5b8e081ae6 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Thu, 29 May 2025 23:24:06 +0000 Subject: [PATCH 04/13] stylefixes Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/awq/base.py | 2 +- .../utils/pytorch/test_module.py | 32 +++++++------------ 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py index c99febeba7..f65ec80599 100644 --- a/src/llmcompressor/modifiers/awq/base.py +++ b/src/llmcompressor/modifiers/awq/base.py @@ -27,9 +27,9 @@ from llmcompressor.utils.fsdp.helpers import get_fsdp_parent from llmcompressor.utils.helpers import calibration_forward_context from llmcompressor.utils.pytorch.module import ( + get_layer_by_name, get_layers, get_matching_layer, - get_layer_by_name, ) __all__ = ["AWQModifier"] diff --git a/tests/llmcompressor/utils/pytorch/test_module.py b/tests/llmcompressor/utils/pytorch/test_module.py index 22763aba90..1ab40aa159 100644 --- a/tests/llmcompressor/utils/pytorch/test_module.py +++ b/tests/llmcompressor/utils/pytorch/test_module.py @@ -1,7 +1,7 @@ import pytest import torch.nn as nn -from llmcompressor.utils.pytorch import get_parent_by_name +from llmcompressor.utils.pytorch import get_layer_by_name @pytest.fixture @@ -15,28 +15,20 @@ def example_nested_module() -> str: @pytest.mark.unit -def test_get_parent_by_name(example_nested_module): - # Test getting the parent of the first layer - name, parent = get_parent_by_name("0", example_nested_module) - assert parent == example_nested_module - +def test_get_layer_by_name(example_nested_module): # Test getting the parent of a nested layer - name, parent = get_parent_by_name("1.0", example_nested_module) - assert parent == example_nested_module[1] - assert name == "1" + layer = get_layer_by_name("0", example_nested_module) + assert layer == example_nested_module[0] - name, parent = get_parent_by_name("1.1", example_nested_module) - assert parent == example_nested_module[1] - assert name == "1" + layer = get_layer_by_name("1.1", example_nested_module) + assert layer == example_nested_module[1][1] - name, parent = get_parent_by_name("2.0", 
example_nested_module) - assert parent == example_nested_module[2] - assert name == "2" + layer = get_layer_by_name("2.0", example_nested_module) + assert layer == example_nested_module[2][0] - name, parent = get_parent_by_name("2.1", example_nested_module) - assert parent == example_nested_module[2] - assert name == "2" + layer = get_layer_by_name("2.1", example_nested_module) + assert layer == example_nested_module[2][1] # Test getting the parent of a non-existent layer - with pytest.raises(ValueError): - get_parent_by_name("non_existent_layer", example_nested_module) + with pytest.raises(AttributeError): + get_layer_by_name("non_existent_layer", example_nested_module) From a2946462ce169fdd22cd7965d7e2ce6f26d69ed6 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 2 Jun 2025 15:25:05 +0000 Subject: [PATCH 05/13] skip if no activations Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/awq/base.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py index f65ec80599..8a83ab3237 100644 --- a/src/llmcompressor/modifiers/awq/base.py +++ b/src/llmcompressor/modifiers/awq/base.py @@ -412,15 +412,11 @@ def cache_parent_kwargs_hook( def create_cache_smooth_activations_hook_fn(smooth_name): def cache_smooth_activations_hook( _module: torch.nn.Module, - args: Union[torch.Tensor, Tuple[torch.Tensor, ...]], + args: Tuple[torch.Tensor, ...], _output: torch.Tensor, ): - if not isinstance(args, Tuple): - print(f"GOT unexpected args {args}") - inp = args[0] if isinstance(args, Tuple) else args - self._smooth_activation_means[smooth_name] = _accumulate_mean( - inp.cpu().detach().squeeze(), + args[0].cpu().detach().squeeze(), self._smooth_activation_means.get(smooth_name, None), ) @@ -490,10 +486,14 @@ def _apply_smoothing(self, model: Module) -> None: # [STEP 3]: Compute output of module # could cache from hook, rather than recomputing here fp16_output = self._run_samples(parent_module) - fp16_output = fp16_output.clip( - torch.finfo(fp16_output.dtype).min, - torch.finfo(fp16_output.dtype).max, - ) + if fp16_output.shape[0] == 0: + breakpoint() + logger.info( + f"Skipping smooth_layer {mapping.smooth_name}, no activations " + "found to scale. This occurs in MoE models when calibration " + "samples don't activate certain experts." 
+ ) + x_mean = self._smooth_activation_means[mapping.smooth_name][0] # [STEP 4]: Compute loss From 665e5548baf76cc106dc9b3cb65f886cc8fdf5e3 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 2 Jun 2025 21:44:56 +0000 Subject: [PATCH 06/13] working with Qwen MoE Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/awq/base.py | 106 +++++++++++--------- src/llmcompressor/modifiers/awq/mappings.py | 18 +++- 2 files changed, 73 insertions(+), 51 deletions(-) diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py index 8a83ab3237..e4b513704a 100644 --- a/src/llmcompressor/modifiers/awq/base.py +++ b/src/llmcompressor/modifiers/awq/base.py @@ -26,11 +26,7 @@ from llmcompressor.pipelines.cache import IntermediatesCache from llmcompressor.utils.fsdp.helpers import get_fsdp_parent from llmcompressor.utils.helpers import calibration_forward_context -from llmcompressor.utils.pytorch.module import ( - get_layer_by_name, - get_layers, - get_matching_layer, -) +from llmcompressor.utils.pytorch.module import get_layer_by_name, get_layers __all__ = ["AWQModifier"] @@ -327,59 +323,54 @@ def _set_resolved_mappings(self, model: Module) -> None: ) smooth_layer = smooth_layers[smooth_name] + smooth_parent_name = ".".join(smooth_name.split(".")[:-1]) + smooth_parent = get_layer_by_name(smooth_parent_name, model) + balance_layers, balance_names = [], [] - for balance_suffix in mapping.balance_layers: - # find the submodule that matches the activation layer - parent_name = ".".join(smooth_name.split(".")[:-1]) - parent_module = get_layer_by_name(parent_name, model) - balance_name, balance_layer = get_matching_layer( - balance_suffix, - smooth_name, - parent_module, - ) - if not balance_layer: - continue - balance_name = f"{parent_name}.{balance_name}" - - # exclude v_proj->o_proj mappings whose shapes are incompatible - # https://github.com/mit-han-lab/llm-awq/pull/67#issuecomment-1681632777 - if ( - ".v_proj" in smooth_name - and ".o_proj" in balance_name - and isinstance(smooth_layer, torch.nn.Linear) - and isinstance(balance_layer, torch.nn.Linear) - and ".o_proj" in balance_name - and ( - ( - ".v_proj" in smooth_name - and smooth_layer.out_features - != balance_layer.in_features - ) - or ( - ".qkv_proj" in smooth_name - and smooth_layer.out_features - != 3 * balance_layer.in_features + for balance_regex in mapping.balance_layers: + # find the submodules that match the activation layer + for balance_suffix, balance_layer in get_layers( + balance_regex, + smooth_parent, + ).items(): + balance_name = f"{smooth_parent_name}.{balance_suffix}" + + # exclude v_proj->o_proj mappings whose shapes are incompatible + # https://github.com/mit-han-lab/llm-awq/pull/67#issuecomment-1681632777 + if ( + ".v_proj" in smooth_name + and ".o_proj" in balance_name + and isinstance(smooth_layer, torch.nn.Linear) + and isinstance(balance_layer, torch.nn.Linear) + and ".o_proj" in balance_name + and ( + ( + ".v_proj" in smooth_name + and smooth_layer.out_features + != balance_layer.in_features + ) + or ( + ".qkv_proj" in smooth_name + and smooth_layer.out_features + != 3 * balance_layer.in_features + ) ) - ) - ): - num_skipped_mappings += 1 - continue + ): + num_skipped_mappings += 1 + continue - balance_layers.append(balance_layer) - balance_names.append(balance_name) + balance_layers.append(balance_layer) + balance_names.append(balance_name) if len(balance_layers) == 0: continue - # each mapping can contain multiple layers to balance, but only - # one layer to smooth elif 
len(balance_layers) == 1: # for single balance layer, parent is the balance layer parent_name, parent = balance_name, balance_layer else: - # for multiple balance layers, - # parent of any balance layer is the parent - parent_name = ".".join(balance_name.split(".")[:-1]) + # for multiple balance layers, find lowest common parent + parent_name = get_lowest_common_parent(balance_names) parent = get_layer_by_name(parent_name, model) resolved_mappings.append( @@ -487,12 +478,13 @@ def _apply_smoothing(self, model: Module) -> None: # could cache from hook, rather than recomputing here fp16_output = self._run_samples(parent_module) if fp16_output.shape[0] == 0: - breakpoint() logger.info( f"Skipping smooth_layer {mapping.smooth_name}, no activations " - "found to scale. This occurs in MoE models when calibration " - "samples don't activate certain experts." + "found to scale. This can occasionally occur in MoE models " + "when certain experts are not activated by calibration samples." ) + del self._smooth_activation_means[mapping.smooth_name] + continue x_mean = self._smooth_activation_means[mapping.smooth_name][0] @@ -757,3 +749,17 @@ def _accumulate_mean( new_count = prev_count + num_added return (prev_sum + sum_added) / new_count, new_count + + +def get_lowest_common_parent(names: List[str]) -> str: + """ + Given a list of names, returns the lowest-scope common parent + Slight alteration from os.path.commonprefix + https://docs.python.org/3/library/os.path.html#os.path.commonprefix + """ + s1 = min(names) + s2 = max(names) + for i, c in enumerate(s1): + if c != s2[i]: + return s1[:i].rstrip(".") + return s1 diff --git a/src/llmcompressor/modifiers/awq/mappings.py b/src/llmcompressor/modifiers/awq/mappings.py index 3f2e16818f..2d5427e1cd 100644 --- a/src/llmcompressor/modifiers/awq/mappings.py +++ b/src/llmcompressor/modifiers/awq/mappings.py @@ -39,6 +39,22 @@ class AWQMapping: ), ] +_moe_default_mappings = [ + AWQMapping( + "re:.*input_layernorm$", + ["re:.*q_proj$", "re:.*k_proj$", "re:.*v_proj$"], + ), + AWQMapping("re:.*v_proj$", ["re:.*o_proj$"]), + AWQMapping( + "re:.*post_attention_layernorm$", + ["re:.*mlp.experts.*.gate_proj$", "re:.*mlp.experts.*.up_proj$"], + ), + AWQMapping( + "re:.*up_proj$", + ["re:.*down_proj$"], + ), +] + # Phi merges # q, k, and v proj layers into a single qkv_proj layer # gate and up proj layers into a single gate_up_proj layer @@ -62,7 +78,7 @@ class AWQMapping: "LlamaForCausalLM": _default_mappings, "Qwen2ForCausalLM": _default_mappings, "Qwen3ForCausalLM": _default_mappings, - "Qwen3MoeForCausalLM": _default_mappings, + "Qwen3MoeForCausalLM": _moe_default_mappings, "MistralForCausalLM": _default_mappings, "Phi3ForCausalLM": _phi_mappings, "Phi3VForCausalLM": _phi_mappings, From 4976ac3f255f14cfaf4110c9a2808cd232d21a6c Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 2 Jun 2025 22:30:59 +0000 Subject: [PATCH 07/13] update get_lowest_common_parent Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/awq/base.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py index e4b513704a..22d33027da 100644 --- a/src/llmcompressor/modifiers/awq/base.py +++ b/src/llmcompressor/modifiers/awq/base.py @@ -370,8 +370,7 @@ def _set_resolved_mappings(self, model: Module) -> None: parent_name, parent = balance_name, balance_layer else: # for multiple balance layers, find lowest common parent - parent_name = 
get_lowest_common_parent(balance_names) - parent = get_layer_by_name(parent_name, model) + parent_name, parent = get_lowest_common_parent(balance_names, model) resolved_mappings.append( ResolvedMapping( @@ -751,15 +750,24 @@ def _accumulate_mean( return (prev_sum + sum_added) / new_count, new_count -def get_lowest_common_parent(names: List[str]) -> str: +def get_lowest_common_parent(names: List[str], module: Module) -> str: """ - Given a list of names, returns the lowest-scope common parent + Given a list of names, returns the lowest-scope common parent, + excluding parents of type ModuleList, which don't seem to play + nicely with hooks. Slight alteration from os.path.commonprefix https://docs.python.org/3/library/os.path.html#os.path.commonprefix """ s1 = min(names) s2 = max(names) + parent_name = s1 for i, c in enumerate(s1): if c != s2[i]: - return s1[:i].rstrip(".") - return s1 + parent_name = s1[:i].rstrip(".") + break + + while parent_name != "": + parent = get_layer_by_name(parent_name, module) + if not isinstance(parent, torch.nn.ModuleList): + return parent_name, parent + parent_name = ".".join(parent_name.split(".")[:-1]) From 9ab1ca18dcbf0a251c73de3229de32d25f088acd Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 2 Jun 2025 22:34:55 +0000 Subject: [PATCH 08/13] mappings reorg Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/awq/mappings.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/llmcompressor/modifiers/awq/mappings.py b/src/llmcompressor/modifiers/awq/mappings.py index 2d5427e1cd..6390445c8c 100644 --- a/src/llmcompressor/modifiers/awq/mappings.py +++ b/src/llmcompressor/modifiers/awq/mappings.py @@ -76,12 +76,13 @@ class AWQMapping: AWQ_MAPPING_REGISTRY: Dict[str, list[AWQMapping]] = { "LlamaForCausalLM": _default_mappings, - "Qwen2ForCausalLM": _default_mappings, - "Qwen3ForCausalLM": _default_mappings, - "Qwen3MoeForCausalLM": _moe_default_mappings, "MistralForCausalLM": _default_mappings, "Phi3ForCausalLM": _phi_mappings, "Phi3VForCausalLM": _phi_mappings, + "Qwen2ForCausalLM": _default_mappings, + "Qwen2MoeForCausalLM": _moe_default_mappings, + "Qwen3ForCausalLM": _default_mappings, + "Qwen3MoeForCausalLM": _moe_default_mappings, } From 3f364fe3056a8c3f918634d0083767e30afa34ef Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Tue, 3 Jun 2025 16:33:36 +0000 Subject: [PATCH 09/13] include awq Qwen MoE example Signed-off-by: Brian Dellabetta --- examples/awq/qwen3_moe_example.py | 82 +++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 examples/awq/qwen3_moe_example.py diff --git a/examples/awq/qwen3_moe_example.py b/examples/awq/qwen3_moe_example.py new file mode 100644 index 0000000000..b8f4a4ec18 --- /dev/null +++ b/examples/awq/qwen3_moe_example.py @@ -0,0 +1,82 @@ +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor import oneshot +from llmcompressor.modifiers.awq import AWQModifier + +# Select model and load it. +MODEL_ID = "Qwen/Qwen3-30B-A3B" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto" +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) + +# Select calibration dataset. +DATASET_ID = "mit-han-lab/pile-val-backup" +DATASET_SPLIT = "validation" + +# Select number of samples. 256 samples is a good place to start. +# Increasing the number of samples can improve accuracy. 
+NUM_CALIBRATION_SAMPLES = 256
+MAX_SEQUENCE_LENGTH = 512
+
+# Load dataset and preprocess.
+ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
+ds = ds.shuffle(seed=42)
+
+
+def preprocess(example):
+    return {
+        "text": tokenizer.apply_chat_template(
+            [{"role": "user", "content": example["text"]}],
+            tokenize=False,
+        )
+    }
+
+
+ds = ds.map(preprocess)
+
+
+# Tokenize inputs.
+def tokenize(sample):
+    return tokenizer(
+        sample["text"],
+        padding=False,
+        max_length=MAX_SEQUENCE_LENGTH,
+        truncation=True,
+        add_special_tokens=False,
+    )
+
+
+ds = ds.map(tokenize, remove_columns=ds.column_names)
+
+# Configure the quantization algorithm to run.
+# NOTE: vllm currently does not support asym MoE, using symmetric here
+recipe = [
+    AWQModifier(
+        ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
+        scheme="W4A16",
+        targets=["Linear"],
+    ),
+]
+
+# Apply algorithms.
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+)
+
+# Confirm generations of the quantized model look sane.
+print("\n\n")
+print("========== SAMPLE GENERATION ==============")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+output = model.generate(input_ids, max_new_tokens=100)
+print(tokenizer.decode(output[0]))
+print("==========================================\n\n")
+
+# Save to disk compressed.
+SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-sym"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)

From 2eac1c3527911a5bd3896c03b74436d364b81962 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Tue, 3 Jun 2025 17:01:17 +0000
Subject: [PATCH 10/13] cleanup

Signed-off-by: Brian Dellabetta
---
 src/llmcompressor/modifiers/awq/base.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py
index 22d33027da..eb0b22e353 100644
--- a/src/llmcompressor/modifiers/awq/base.py
+++ b/src/llmcompressor/modifiers/awq/base.py
@@ -338,9 +338,7 @@ def _set_resolved_mappings(self, model: Module) -> None:
                         # exclude v_proj->o_proj mappings whose shapes are incompatible
                         # https://github.com/mit-han-lab/llm-awq/pull/67#issuecomment-1681632777
                         if (
-                            ".v_proj" in smooth_name
-                            and ".o_proj" in balance_name
-                            and isinstance(smooth_layer, torch.nn.Linear)
+                            isinstance(smooth_layer, torch.nn.Linear)
                             and isinstance(balance_layer, torch.nn.Linear)
                             and ".o_proj" in balance_name
                             and (
@@ -406,6 +404,7 @@ def cache_smooth_activations_hook(
             _output: torch.Tensor,
         ):
             self._smooth_activation_means[smooth_name] = _accumulate_mean(
+                # Assume that first argument is the input
                 args[0].cpu().detach().squeeze(),
                 self._smooth_activation_means.get(smooth_name, None),
             )

From e31074f7b2f78c6928f074c19001b75da3876752 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Tue, 3 Jun 2025 17:19:27 +0000
Subject: [PATCH 11/13] unit tests

Signed-off-by: Brian Dellabetta
---
 src/llmcompressor/modifiers/awq/base.py       |  8 +--
 .../llmcompressor/modifiers/awq/test_base.py  | 53 +++++++++++++++++++
 2 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py
index eb0b22e353..96c8e6a08a 100644
--- a/src/llmcompressor/modifiers/awq/base.py
+++ b/src/llmcompressor/modifiers/awq/base.py
@@ -749,17 +749,19 @@ def _accumulate_mean(
     return (prev_sum + sum_added) / new_count, new_count
 
-def get_lowest_common_parent(names: List[str], module: Module) 
-> str: +def get_lowest_common_parent(names: List[str], module: Module) -> Tuple[str, Module]: """ Given a list of names, returns the lowest-scope common parent, excluding parents of type ModuleList, which don't seem to play nicely with hooks. - Slight alteration from os.path.commonprefix + Returns name of parent and pointer to parent module + + Implementation is a small alteration of os.path.commonprefix https://docs.python.org/3/library/os.path.html#os.path.commonprefix """ s1 = min(names) s2 = max(names) - parent_name = s1 + parent_name = "" for i, c in enumerate(s1): if c != s2[i]: parent_name = s1[:i].rstrip(".") diff --git a/tests/llmcompressor/modifiers/awq/test_base.py b/tests/llmcompressor/modifiers/awq/test_base.py index 035519b5aa..35e288845b 100644 --- a/tests/llmcompressor/modifiers/awq/test_base.py +++ b/tests/llmcompressor/modifiers/awq/test_base.py @@ -4,6 +4,7 @@ from pydantic import ValidationError from llmcompressor.modifiers.awq import AWQMapping, AWQModifier +from llmcompressor.modifiers.awq.base import get_lowest_common_parent from llmcompressor.modifiers.factory import ModifierFactory from tests.llmcompressor.modifiers.conf import setup_modifier_factory @@ -172,3 +173,55 @@ def test_validate(): ), } ) + + +@pytest.mark.unit +def test_get_lowest_common_parent(): + mlp = torch.nn.ModuleDict( + { + "experts": torch.nn.ModuleList( + [ + torch.nn.ModuleDict( + { + "gate_proj": torch.nn.Linear(4, 2), + "down_proj": torch.nn.Linear(4, 2), + } + ) + for _ in range(10) + ] + ) + } + ) + self_attn = torch.nn.ModuleDict( + { + "q_proj": torch.nn.Linear(4, 2), + "k_proj": torch.nn.Linear(4, 2), + "v_proj": torch.nn.Linear(4, 2), + "o_proj": torch.nn.Linear(4, 4), + } + ) + model = torch.nn.ModuleDict( + { + "decoder": torch.nn.ModuleDict( + { + "self_attn": self_attn, + "mlp": mlp, + } + ) + } + ) + + parent_name, parent = get_lowest_common_parent( + ["decoder.mlp.experts.1.gate_proj", "decoder.mlp.experts.4.down_proj"], model + ) + assert parent_name == "decoder.mlp" and parent == mlp + + parent_name, parent = get_lowest_common_parent( + ["decoder.self_attn.q_proj", "decoder.self_attn.v_proj"], model + ) + assert parent_name == "decoder.self_attn" and parent == self_attn + + parent_name, parent = get_lowest_common_parent( + ["decoder.mlp.experts.1.gate_proj", "decoder.self_attn.v_proj"], model + ) + assert parent_name == "decoder" and parent == model["decoder"] From 75a1602fef9eabad84b14a522bbe4ec0b2eee69f Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Tue, 3 Jun 2025 17:22:36 +0000 Subject: [PATCH 12/13] test updates Signed-off-by: Brian Dellabetta --- src/llmcompressor/modifiers/awq/base.py | 4 +++- tests/llmcompressor/modifiers/awq/test_base.py | 8 +++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py index 96c8e6a08a..1e48f7c76d 100644 --- a/src/llmcompressor/modifiers/awq/base.py +++ b/src/llmcompressor/modifiers/awq/base.py @@ -767,7 +767,9 @@ def get_lowest_common_parent(names: List[str], module: Module) -> Tuple[str, Mod parent_name = s1[:i].rstrip(".") break - while parent_name != "": + while True: + if parent_name == "": + return "", module parent = get_layer_by_name(parent_name, module) if not isinstance(parent, torch.nn.ModuleList): return parent_name, parent diff --git a/tests/llmcompressor/modifiers/awq/test_base.py b/tests/llmcompressor/modifiers/awq/test_base.py index 35e288845b..a4adfbdac0 100644 --- a/tests/llmcompressor/modifiers/awq/test_base.py +++ 
b/tests/llmcompressor/modifiers/awq/test_base.py
@@ -202,12 +202,13 @@ def test_get_lowest_common_parent():
     )
     model = torch.nn.ModuleDict(
         {
+            "embed_tokens": torch.nn.Linear(4, 2),
             "decoder": torch.nn.ModuleDict(
                 {
                     "self_attn": self_attn,
                     "mlp": mlp,
                 }
-            )
+            ),
         }
     )
 
@@ -225,3 +226,8 @@ def test_get_lowest_common_parent():
         ["decoder.mlp.experts.1.gate_proj", "decoder.self_attn.v_proj"], model
     )
     assert parent_name == "decoder" and parent == model["decoder"]
+
+    parent_name, parent = get_lowest_common_parent(
+        ["embed_tokens", "decoder.self_attn.v_proj"], model
+    )
+    assert parent_name == "" and parent == model

From 96d5f5904d68c89d91e319c5374a1ac0880d0470 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Thu, 5 Jun 2025 16:52:02 +0000
Subject: [PATCH 13/13] code review updates

Signed-off-by: Brian Dellabetta
---
 src/llmcompressor/modifiers/awq/base.py   | 30 ++++++++++++++---------
 src/llmcompressor/utils/pytorch/module.py |  5 ++--
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py
index 1e48f7c76d..f95aaaea84 100644
--- a/src/llmcompressor/modifiers/awq/base.py
+++ b/src/llmcompressor/modifiers/awq/base.py
@@ -2,7 +2,10 @@
 from typing import Dict, List, Optional, Tuple, Union
 
 import torch
-from compressed_tensors.quantization import disable_quantization
+from compressed_tensors.quantization import (
+    disable_quantization,
+    find_name_or_class_matches,
+)
 from compressed_tensors.utils import (
     align_module_device,
     get_execution_device,
@@ -308,9 +311,8 @@ def _set_resolved_mappings(self, model: Module) -> None:
             smooth_names = [
                 smooth_name
                 for smooth_name in smooth_layers
-                if (
-                    smooth_name not in self.ignore
-                    and not smooth_name.endswith("_observer")
+                if not find_name_or_class_matches(
+                    smooth_name, model, self.ignore + ["re:.*_observer$"]
                 )
             ]
 
@@ -340,15 +342,15 @@ def _set_resolved_mappings(self, model: Module) -> None:
                         if (
                             isinstance(smooth_layer, torch.nn.Linear)
                             and isinstance(balance_layer, torch.nn.Linear)
-                            and ".o_proj" in balance_name
+                            and balance_name.endswith(".o_proj")
                             and (
                                 (
-                                    ".v_proj" in smooth_name
+                                    smooth_name.endswith(".v_proj")
                                     and smooth_layer.out_features
                                     != balance_layer.in_features
                                 )
                                 or (
-                                    ".qkv_proj" in smooth_name
+                                    smooth_name.endswith(".qkv_proj")
                                     and smooth_layer.out_features
                                     != 3 * balance_layer.in_features
                                 )
@@ -475,7 +477,7 @@ def _apply_smoothing(self, model: Module) -> None:
             # [STEP 3]: Compute output of module
             # could cache from hook, rather than recomputing here
             fp16_output = self._run_samples(parent_module)
-            if fp16_output.shape[0] == 0:
+            if fp16_output.numel() == 0:
                 logger.info(
                     f"Skipping smooth_layer {mapping.smooth_name}, no activations "
                     "found to scale. This can occasionally occur in MoE models "
                     "when certain experts are not activated by calibration samples."
@@ -549,6 +551,7 @@ def _run_samples(self, module: Module) -> torch.Tensor:
             ]
             return torch.cat(
                 [
+                    # If Tuple, assume that first argument is the input
                     output[0] if isinstance(output, Tuple) else output
                     for output in outputs
                 ],
@@ -751,9 +754,14 @@ def _accumulate_mean(
 
 def get_lowest_common_parent(names: List[str], module: Module) -> Tuple[str, Module]:
     """
-    Given a list of names, returns the lowest-scope common parent,
-    excluding parents of type ModuleList, which don't seem to play
-    nicely with hooks.
+    Given a list of names, returns the lowest-scope common parent.
+
+    NOTE: function excludes parents of type ModuleList, which don't play
+    nicely with hooks because their forward method is never directly
+    called for MoE models. See Qwen3MoeSparseMoeBlock for an example: experts
+    are selected based on router output and their forwards are called directly.
+    https://github.com/huggingface/transformers/blob/v4.52.4/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py#L233
+
     Returns name of parent and pointer to parent module
 
     Implementation is a small alteration of os.path.commonprefix
diff --git a/src/llmcompressor/utils/pytorch/module.py b/src/llmcompressor/utils/pytorch/module.py
index 61480478ca..835493fa3d 100644
--- a/src/llmcompressor/utils/pytorch/module.py
+++ b/src/llmcompressor/utils/pytorch/module.py
@@ -4,7 +4,7 @@
 
 import difflib
 import re
-from functools import reduce
+from operator import attrgetter
 from typing import Dict, List, Optional, Tuple, Union
 
 import torch
@@ -343,5 +343,4 @@ def get_layer_by_name(layer_name: str, module: Module) -> Module:
     :param module: Module in which to search for layer_name
     :return: Module, the layer with name layer_name
     """
-    names = layer_name.split(sep=".")
-    return reduce(getattr, names, module)
+    return attrgetter(layer_name)(module)
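
Editor's note: the three short sketches below are illustrative additions, not part of the patch series above. This first one mirrors, in standalone form, the two helpers the series converges on (attrgetter-based get_layer_by_name and the commonprefix-style get_lowest_common_parent) so their behavior can be sanity-checked on a toy module tree; the toy layer names and sizes are invented.

from operator import attrgetter
from typing import List, Tuple

import torch


def get_layer_by_name(layer_name: str, module: torch.nn.Module) -> torch.nn.Module:
    # attrgetter resolves dotted names like "mlp.experts.0" in a single call
    # (nn.Module exposes children as attributes, including ModuleList indices).
    return attrgetter(layer_name)(module)


def get_lowest_common_parent(
    names: List[str], module: torch.nn.Module
) -> Tuple[str, torch.nn.Module]:
    # Longest common dotted prefix of the lexicographic min/max names, then
    # walk upward past ModuleList parents, mirroring the final patch.
    s1, s2 = min(names), max(names)
    parent_name = ""
    for i, c in enumerate(s1):
        if c != s2[i]:
            parent_name = s1[:i].rstrip(".")
            break
    while True:
        if parent_name == "":
            return "", module
        parent = get_layer_by_name(parent_name, module)
        if not isinstance(parent, torch.nn.ModuleList):
            return parent_name, parent
        parent_name = ".".join(parent_name.split(".")[:-1])


model = torch.nn.ModuleDict(
    {
        "mlp": torch.nn.ModuleDict(
            {"experts": torch.nn.ModuleList([torch.nn.Linear(4, 4) for _ in range(2)])}
        )
    }
)

# Common prefix is "mlp.experts", a ModuleList, so the walk stops at "mlp".
name, parent = get_lowest_common_parent(["mlp.experts.0", "mlp.experts.1"], model)
assert name == "mlp" and parent is model["mlp"]

The lexicographic min/max trick is sound because any prefix shared by the smallest and largest strings is also shared by every name that sorts between them.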
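
Editor's note (illustrative, not from the patches): the v_proj->o_proj skip introduced in PATCH 01 guards against grouped-query attention, where the width match between the two layers breaks: v_proj emits num_key_value_heads * head_dim features while o_proj consumes num_attention_heads * head_dim. A minimal sketch with invented dimensions:

import torch

hidden_size, head_dim = 64, 8
num_attention_heads, num_kv_heads = 8, 2  # GQA: fewer KV heads than query heads

v_proj = torch.nn.Linear(hidden_size, num_kv_heads * head_dim)
o_proj = torch.nn.Linear(num_attention_heads * head_dim, hidden_size)

# The resolved-mapping check: mismatched widths (16 != 64 here), so the pair
# is skipped rather than given a shared smoothing scale.
assert v_proj.out_features != o_proj.in_features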
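
Editor's note (illustrative, not from the patches): the trailing "$" anchors added to the mapping regexes in PATCH 02 matter if "re:" targets are resolved with re.match semantics, anchored at the start of the module name but not at the end, as compressed-tensors appears to do for target matching; that reading, shown with invented names:

import re

names = [
    "model.layers.0.self_attn.v_proj",
    "model.layers.0.self_attn.v_proj.weight_observer",
]

# Unanchored: both names match, pulling observer submodules into the mapping.
assert [n for n in names if re.match(r".*v_proj", n)] == names

# Anchored: only the Linear layer itself matches.
assert [n for n in names if re.match(r".*v_proj$", n)] == names[:1]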