"""Calibration support for GLM-4 MoE models.

Registers a calibration-time replacement for ``Glm4MoeMoE`` that can route
every calibration token through every expert, so quantization observers
collect statistics for all experts rather than only the routed ones.
"""

import torch
from transformers.models.glm4_moe.configuration_glm4_moe import Glm4MoeConfig
from transformers.models.glm4_moe.modeling_glm4_moe import (
    Glm4MoeMoE as OriginalGlm4MoeMoE,
)

from llmcompressor.modeling.moe_context import MoECalibrationModule


@MoECalibrationModule.register("Glm4MoeMoE")
class CalibrationGlm4MoeMoE(MoECalibrationModule):
    """
    Calibration version of Glm4MoeMoE that sends all tokens to all experts.

    During calibration, when calibrate_all_experts=True, all tokens are sent
    to all experts to ensure proper quantization statistics are collected for
    every expert, not just those activated by the calibration data routing.
    """

    # Not permanent: the original module is restored when the calibration
    # context exits (see ``restore``).
    is_permanent = False

    def __init__(
        self,
        original: OriginalGlm4MoeMoE,
        config: Glm4MoeConfig,
        calibrate_all_experts: bool = True,
    ):
        """
        :param original: the Glm4MoeMoE module being wrapped for calibration
        :param config: model configuration (kept for reference)
        :param calibrate_all_experts: when True, run every token through every
            expert so all experts see calibration data
        """
        super().__init__()
        self.config = config
        # Reuse the original submodules so observed statistics attach to the
        # exact parameters that will later be quantized.
        self.experts = original.experts
        self.gate = original.gate
        self.shared_experts = original.shared_experts
        self.calibrate_all_experts = calibrate_all_experts

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        Forward pass with optional calibration mode.

        When calibrate_all_experts=True, all tokens are run through every
        expert (to gather statistics) while routing weights still determine
        the final output combination. When False, behavior matches standard
        MoE routing: only routed tokens reach each expert.
        """
        residuals = hidden_states
        orig_shape = hidden_states.shape
        topk_indices, topk_weights = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])

        # Inline of the upstream moe() logic, extended with calibration
        # support. Accumulate in the router-weight dtype.
        final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)
        expert_mask = torch.nn.functional.one_hot(
            topk_indices, num_classes=len(self.experts)
        )
        expert_mask = expert_mask.permute(2, 0, 1)

        for expert_idx, expert in enumerate(self.experts):
            mask = expert_mask[expert_idx]
            token_indices, weight_indices = torch.where(mask)
            has_tokens = token_indices.numel() > 0

            if self.calibrate_all_experts:
                # Run *all* tokens through the expert so observers see the
                # full calibration distribution. Only routed tokens
                # contribute to the output below.
                expert_output_full = expert(hidden_states)
                if not has_tokens:
                    continue  # stats gathered; nothing routed to this expert
                expert_output = expert_output_full[token_indices]
            else:
                # Standard MoE: only process tokens routed to this expert.
                if not has_tokens:
                    continue
                expert_output = expert(hidden_states[token_indices])

            # Shared combination logic for both branches (deduplicated).
            expert_weights = topk_weights[token_indices, weight_indices]
            weighted_output = expert_output * expert_weights.unsqueeze(-1)
            final_hidden_states.index_add_(0, token_indices, weighted_output)

        hidden_states = final_hidden_states.type(hidden_states.dtype).view(*orig_shape)
        # Shared experts always process every token, added on top of routing.
        hidden_states = hidden_states + self.shared_experts(residuals)
        return hidden_states

    def restore(self, original: torch.nn.Module) -> torch.nn.Module:
        """
        Restore the original module structure.

        Since is_permanent=False, this is called when exiting the calibration
        context; the wrapped original is simply returned unchanged.
        """
        return original
"""Tests for the GLM-4 MoE calibration module replacement."""

import contextlib
from functools import partial

import pytest
import torch
from transformers import AutoModelForCausalLM

from llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE
from llmcompressor.modeling.moe_context import moe_calibration_context
from llmcompressor.utils.dev import skip_weights_download
from llmcompressor.utils.helpers import calibration_forward_context
from tests.testing_utils import requires_cadence, requires_gpu

# Skip the whole module on transformers versions without GLM-4 MoE support.
Glm4MoeConfig = pytest.importorskip(
    "transformers.models.glm4_moe.configuration_glm4_moe",
    reason="Glm4MoeConfig not available in this version of transformers",
).Glm4MoeConfig
OriginalGlm4MoeMoE = pytest.importorskip(
    "transformers.models.glm4_moe.modeling_glm4_moe",
    reason="Glm4MoeMoE not available in this version of transformers",
).Glm4MoeMoE


@requires_cadence("weekly")
# TODO(review): "THUDM/glm-4-9b-chat" is a dense checkpoint, not a GLM-4 MoE
# model — update to an actual GLM-4 MoE stub or the isinstance search below
# will find no CalibrationGlm4MoeMoE layer.
@pytest.mark.parametrize("model_stub", ["THUDM/glm-4-9b-chat"])
def test_calib_replace_glm4moe_all_experts(model_stub):
    """Every expert must receive tokens when calibrate_all_experts=True."""
    with skip_weights_download():
        model = AutoModelForCausalLM.from_pretrained(model_stub, trust_remote_code=True)

    with contextlib.ExitStack() as stack:
        stack.enter_context(calibration_forward_context(model))
        stack.enter_context(moe_calibration_context(model, calibrate_all_experts=True))

        # Locate the first MoE layer replaced by the calibration context.
        moe_layer = None
        for _, module in model.named_modules():
            if isinstance(module, CalibrationGlm4MoeMoE):
                moe_layer = module
                break

        assert moe_layer is not None

        num_experts = len(moe_layer.experts)
        expert_triggered = [False for _ in range(num_experts)]

        def hook_fn(i, module, input, output):
            # Records that expert *i* executed a forward pass.
            expert_triggered[i] = True

        # functools.partial binds each expert index into its hook.
        for i, expert in enumerate(moe_layer.experts):
            expert.register_forward_hook(partial(hook_fn, i))

        # Dummy input shaped like hidden_states for a direct layer call.
        hidden_dim = model.config.hidden_size
        batch, seq_len = 4, 32
        sample = torch.randn(batch, seq_len, hidden_dim, dtype=torch.float32)

        with torch.no_grad():
            _ = moe_layer(sample)

        assert all(
            expert_triggered
        ), f"Not all experts were triggered: {expert_triggered}"


@requires_gpu
def test_calib_glm4moe_module():
    """Calibration wrapper must reproduce the original module's output."""
    config = Glm4MoeConfig()
    with torch.device("cuda"):
        original = OriginalGlm4MoeMoE(config).eval()

    # Dummy input shaped like hidden_states.
    hidden_dim = config.hidden_size
    batch, seq_len = 4, 32
    sample = torch.randn(batch, seq_len, hidden_dim, device="cuda")

    with calibration_forward_context(original):
        true_output = original(sample)

    # Both calibration modes must be numerically identical to the original:
    # calibrate_all_experts only changes which tokens each expert *sees*,
    # never the routed combination that forms the output.
    module = CalibrationGlm4MoeMoE(original, config, calibrate_all_experts=True)
    with calibration_forward_context(module):
        output = module(sample)
    assert torch.allclose(true_output, output, atol=1e-6)

    module = CalibrationGlm4MoeMoE(original, config, calibrate_all_experts=False)
    with calibration_forward_context(module):
        output = module(sample)
    assert torch.allclose(true_output, output, atol=1e-6)
import os
from pathlib import Path

from datasets import concatenate_datasets, load_dataset
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
# Imported for its side effect: registers the GLM-4 MoE calibration module
# so oneshot routes every calibration sample through every expert.
from llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE  # noqa: F401
from llmcompressor.modifiers.awq import AWQMapping, AWQModifier

# W4A16 AWQ quantization of GLM-4.7 with group size 32, calibrated on two
# datasets (a general compression-calibration set plus a reasoning set).
# Observed on an RTX PRO 6000 workstation card: up to ~40 GB VRAM and
# roughly ~3.5 hours of runtime — your numbers will vary.
#
# GLM-4.7 uses dense MLPs for the first three decoder layers; those layers
# (plus each MoE layer's shared experts and the output head) are excluded
# from quantization below, and the AWQ mappings likewise skip layers 0-2.

# =========================
# Load ENV variables
# =========================
# The .env file must sit next to this script and define exactly:
#   SRC_DIR=/path/to/source/model/
#   DST_DIR=/path/to/output/dir
load_dotenv(Path(__file__).with_name(".env"))


def require_env(key: str) -> str:
    """Return the stripped value of environment variable *key*.

    :raises RuntimeError: if the variable is unset or blank.
    """
    val = os.getenv(key)
    if not val or not val.strip():
        raise RuntimeError(f"Missing environment variable: {key}")
    return val.strip()


SRC_DIR = require_env("SRC_DIR")
DST_DIR = require_env("DST_DIR")

# =========================
# Model (GLM / GLM-MoE)
# =========================
model = AutoModelForCausalLM.from_pretrained(SRC_DIR, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(SRC_DIR, trust_remote_code=True)

# =========================
# Calibration data: 60% Neural Magic, 40% Rombo Optimized Reasoning
# =========================
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

NUM_NEURALMAGIC = int(NUM_CALIBRATION_SAMPLES * 0.6)  # ~307 samples
NUM_ROMBO = NUM_CALIBRATION_SAMPLES - NUM_NEURALMAGIC  # ~205 samples

print(
    f"Loading calibration datasets: {NUM_NEURALMAGIC} from Neural Magic, "
    f"{NUM_ROMBO} from Rombo"
)

ds_neuralmagic = load_dataset("neuralmagic/LLM_compression_calibration", split="train")
n_nm = min(NUM_NEURALMAGIC, len(ds_neuralmagic))
ds_neuralmagic = ds_neuralmagic.shuffle(seed=42).select(range(n_nm))


def preprocess_neuralmagic(batch):
    """Render the dataset's chat ``messages`` into plain text via the
    tokenizer's chat template."""
    rendered = []
    for messages in batch["messages"]:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
        )
        rendered.append(text)
    return {"text": rendered}


ds_neuralmagic = ds_neuralmagic.map(preprocess_neuralmagic, batched=True, num_proc=4)

# Rombo format: {"instruction": "", "input": [""], "output": [""]}
ds_rombo = load_dataset("Rombo-Org/Optimized_Reasoning", split="train")
n_rombo = min(NUM_ROMBO, len(ds_rombo))
ds_rombo = ds_rombo.shuffle(seed=43).select(range(n_rombo))


def preprocess_rombo(batch):
    """Join each instruction with its non-empty input/output items into one
    text sample."""
    rendered = []
    for instruction, inputs, outputs in zip(
        batch["instruction"], batch["input"], batch["output"]
    ):
        parts = [instruction]
        if isinstance(inputs, list):
            parts.extend(f"\n\nInput: {inp}" for inp in inputs if inp and inp.strip())
        if isinstance(outputs, list):
            parts.extend(
                f"\n\nOutput: {out}" for out in outputs if out and out.strip()
            )
        rendered.append("".join(parts))
    return {"text": rendered}


ds_rombo = ds_rombo.map(preprocess_rombo, batched=True, num_proc=4)

# Combine, shuffle, and tokenize in batches.
ds = concatenate_datasets([ds_neuralmagic, ds_rombo]).shuffle(seed=44)
ds = ds.map(
    lambda batch: tokenizer(
        batch["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    ),
    batched=True,
    remove_columns=ds.column_names,
    num_proc=4,
)

print(f"Combined calibration dataset: {len(ds)} samples")

# =========================
# AWQ recipe with config_groups
# - Weight-only INT4 (W4A16 symmetric), group_size 32
# - IMPORTANT: do NOT ignore mlp.gate / gate_up_proj (merged layer)
# - Keep shared experts and the output head unquantized
# =========================
moe_ignores = [
    # NOTE: llm-compressor/compressed-tensors only treats patterns prefixed
    # with "re:" as regular expressions; without the prefix these would be
    # matched as literal module names and ignore nothing.
    # Layers 0-2 are dense: skip attention and MLP projections.
    r"re:model\.layers\.[0-2]\.self_attn\.(q|k|v|o)_proj",
    r"re:model\.layers\.[0-2]\.mlp\.(gate|up|down)_proj",
    # MoE layers (3-91): keep the always-active shared experts unquantized.
    # Dense layers 0-2 have no shared_experts, so matching every layer index
    # here is safe.
    r"re:model\.layers\.\d+\.mlp\.shared_experts\.(gate|up|down)_proj",
    # Keep the output head unquantized.
    "lm_head",
]

# Explicit AWQ mappings that skip the dense layers (0-2).
mappings = [
    AWQMapping(
        smooth_layer=f"model.layers.{layer_idx}.input_layernorm",
        balance_layers=[
            f"model.layers.{layer_idx}.self_attn.q_proj",
            f"model.layers.{layer_idx}.self_attn.k_proj",
            f"model.layers.{layer_idx}.self_attn.v_proj",
        ],
    )
    for layer_idx in range(3, 92)  # MoE layers only
]

recipe = [
    AWQModifier(
        ignore=moe_ignores,
        mappings=mappings,  # Provide explicit mappings
        config_groups={
            "group_0": {
                "targets": ["Linear"],
                "weights": {
                    "num_bits": 4,
                    "type": "int",
                    "symmetric": True,  # W4A16 (symmetric)
                    "strategy": "group",
                    "group_size": 32,
                    "dynamic": False,
                },
            },
        },
    ),
]

# =========================
# Quantize + save (writes quantization_config for vLLM)
# =========================
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Work around generation-config validation: a temperature set while
# do_sample=False fails save-time validation, so enable sampling.
gen_cfg = getattr(model, "generation_config", None)
if gen_cfg is not None and getattr(gen_cfg, "temperature", None) is not None:
    if not getattr(gen_cfg, "do_sample", False):
        gen_cfg.do_sample = True

model.save_pretrained(DST_DIR, save_compressed=True)
tokenizer.save_pretrained(DST_DIR)

print("Saved to:", DST_DIR)
"model.layers.6.mlp.shared_experts.gate_proj", - "model.layers.6.mlp.shared_experts.up_proj", - "model.layers.6.mlp.shared_experts.down_proj", - "model.layers.7.mlp.shared_experts.gate_proj", - "model.layers.7.mlp.shared_experts.up_proj", - "model.layers.7.mlp.shared_experts.down_proj", - "model.layers.8.mlp.shared_experts.gate_proj", - "model.layers.8.mlp.shared_experts.up_proj", - "model.layers.8.mlp.shared_experts.down_proj", - "model.layers.9.mlp.shared_experts.gate_proj", - "model.layers.9.mlp.shared_experts.up_proj", - "model.layers.9.mlp.shared_experts.down_proj", - "model.layers.10.mlp.shared_experts.gate_proj", - "model.layers.10.mlp.shared_experts.up_proj", - "model.layers.10.mlp.shared_experts.down_proj", - "model.layers.11.mlp.shared_experts.gate_proj", - "model.layers.11.mlp.shared_experts.up_proj", - "model.layers.11.mlp.shared_experts.down_proj", - "model.layers.12.mlp.shared_experts.gate_proj", - "model.layers.12.mlp.shared_experts.up_proj", - "model.layers.12.mlp.shared_experts.down_proj", - "model.layers.13.mlp.shared_experts.gate_proj", - "model.layers.13.mlp.shared_experts.up_proj", - "model.layers.13.mlp.shared_experts.down_proj", - "model.layers.14.mlp.shared_experts.gate_proj", - "model.layers.14.mlp.shared_experts.up_proj", - "model.layers.14.mlp.shared_experts.down_proj", - "model.layers.15.mlp.shared_experts.gate_proj", - "model.layers.15.mlp.shared_experts.up_proj", - "model.layers.15.mlp.shared_experts.down_proj", - "model.layers.16.mlp.shared_experts.gate_proj", - "model.layers.16.mlp.shared_experts.up_proj", - "model.layers.16.mlp.shared_experts.down_proj", - "model.layers.17.mlp.shared_experts.gate_proj", - "model.layers.17.mlp.shared_experts.up_proj", - "model.layers.17.mlp.shared_experts.down_proj", - "model.layers.18.mlp.shared_experts.gate_proj", - "model.layers.18.mlp.shared_experts.up_proj", - "model.layers.18.mlp.shared_experts.down_proj", - "model.layers.19.mlp.shared_experts.gate_proj", - 
"model.layers.19.mlp.shared_experts.up_proj", - "model.layers.19.mlp.shared_experts.down_proj", - "model.layers.20.mlp.shared_experts.gate_proj", - "model.layers.20.mlp.shared_experts.up_proj", - "model.layers.20.mlp.shared_experts.down_proj", - "model.layers.21.mlp.shared_experts.gate_proj", - "model.layers.21.mlp.shared_experts.up_proj", - "model.layers.21.mlp.shared_experts.down_proj", - "model.layers.22.mlp.shared_experts.gate_proj", - "model.layers.22.mlp.shared_experts.up_proj", - "model.layers.22.mlp.shared_experts.down_proj", - "model.layers.23.mlp.shared_experts.gate_proj", - "model.layers.23.mlp.shared_experts.up_proj", - "model.layers.23.mlp.shared_experts.down_proj", - "model.layers.24.mlp.shared_experts.gate_proj", - "model.layers.24.mlp.shared_experts.up_proj", - "model.layers.24.mlp.shared_experts.down_proj", - "model.layers.25.mlp.shared_experts.gate_proj", - "model.layers.25.mlp.shared_experts.up_proj", - "model.layers.25.mlp.shared_experts.down_proj", - "model.layers.26.mlp.shared_experts.gate_proj", - "model.layers.26.mlp.shared_experts.up_proj", - "model.layers.26.mlp.shared_experts.down_proj", - "model.layers.27.mlp.shared_experts.gate_proj", - "model.layers.27.mlp.shared_experts.up_proj", - "model.layers.27.mlp.shared_experts.down_proj", - "model.layers.28.mlp.shared_experts.gate_proj", - "model.layers.28.mlp.shared_experts.up_proj", - "model.layers.28.mlp.shared_experts.down_proj", - "model.layers.29.mlp.shared_experts.gate_proj", - "model.layers.29.mlp.shared_experts.up_proj", - "model.layers.29.mlp.shared_experts.down_proj", - "model.layers.30.mlp.shared_experts.gate_proj", - "model.layers.30.mlp.shared_experts.up_proj", - "model.layers.30.mlp.shared_experts.down_proj", - "model.layers.31.mlp.shared_experts.gate_proj", - "model.layers.31.mlp.shared_experts.up_proj", - "model.layers.31.mlp.shared_experts.down_proj", - "model.layers.32.mlp.shared_experts.gate_proj", - "model.layers.32.mlp.shared_experts.up_proj", - 
"model.layers.32.mlp.shared_experts.down_proj", - "model.layers.33.mlp.shared_experts.gate_proj", - "model.layers.33.mlp.shared_experts.up_proj", - "model.layers.33.mlp.shared_experts.down_proj", - "model.layers.34.mlp.shared_experts.gate_proj", - "model.layers.34.mlp.shared_experts.up_proj", - "model.layers.34.mlp.shared_experts.down_proj", - "model.layers.35.mlp.shared_experts.gate_proj", - "model.layers.35.mlp.shared_experts.up_proj", - "model.layers.35.mlp.shared_experts.down_proj", - "model.layers.36.mlp.shared_experts.gate_proj", - "model.layers.36.mlp.shared_experts.up_proj", - "model.layers.36.mlp.shared_experts.down_proj", - "model.layers.37.mlp.shared_experts.gate_proj", - "model.layers.37.mlp.shared_experts.up_proj", - "model.layers.37.mlp.shared_experts.down_proj", - "model.layers.38.mlp.shared_experts.gate_proj", - "model.layers.38.mlp.shared_experts.up_proj", - "model.layers.38.mlp.shared_experts.down_proj", - "model.layers.39.mlp.shared_experts.gate_proj", - "model.layers.39.mlp.shared_experts.up_proj", - "model.layers.39.mlp.shared_experts.down_proj", - "model.layers.40.mlp.shared_experts.gate_proj", - "model.layers.40.mlp.shared_experts.up_proj", - "model.layers.40.mlp.shared_experts.down_proj", - "model.layers.41.mlp.shared_experts.gate_proj", - "model.layers.41.mlp.shared_experts.up_proj", - "model.layers.41.mlp.shared_experts.down_proj", - "model.layers.42.mlp.shared_experts.gate_proj", - "model.layers.42.mlp.shared_experts.up_proj", - "model.layers.42.mlp.shared_experts.down_proj", - "model.layers.43.mlp.shared_experts.gate_proj", - "model.layers.43.mlp.shared_experts.up_proj", - "model.layers.43.mlp.shared_experts.down_proj", - "model.layers.44.mlp.shared_experts.gate_proj", - "model.layers.44.mlp.shared_experts.up_proj", - "model.layers.44.mlp.shared_experts.down_proj", - "model.layers.45.mlp.shared_experts.gate_proj", - "model.layers.45.mlp.shared_experts.up_proj", - "model.layers.45.mlp.shared_experts.down_proj", - 
"model.layers.46.mlp.shared_experts.gate_proj", - "model.layers.46.mlp.shared_experts.up_proj", - "model.layers.46.mlp.shared_experts.down_proj", - "model.layers.47.mlp.shared_experts.gate_proj", - "model.layers.47.mlp.shared_experts.up_proj", - "model.layers.47.mlp.shared_experts.down_proj", - "model.layers.48.mlp.shared_experts.gate_proj", - "model.layers.48.mlp.shared_experts.up_proj", - "model.layers.48.mlp.shared_experts.down_proj", - "model.layers.49.mlp.shared_experts.gate_proj", - "model.layers.49.mlp.shared_experts.up_proj", - "model.layers.49.mlp.shared_experts.down_proj", - "model.layers.50.mlp.shared_experts.gate_proj", - "model.layers.50.mlp.shared_experts.up_proj", - "model.layers.50.mlp.shared_experts.down_proj", - "model.layers.51.mlp.shared_experts.gate_proj", - "model.layers.51.mlp.shared_experts.up_proj", - "model.layers.51.mlp.shared_experts.down_proj", - "model.layers.52.mlp.shared_experts.gate_proj", - "model.layers.52.mlp.shared_experts.up_proj", - "model.layers.52.mlp.shared_experts.down_proj", - "model.layers.53.mlp.shared_experts.gate_proj", - "model.layers.53.mlp.shared_experts.up_proj", - "model.layers.53.mlp.shared_experts.down_proj", - "model.layers.54.mlp.shared_experts.gate_proj", - "model.layers.54.mlp.shared_experts.up_proj", - "model.layers.54.mlp.shared_experts.down_proj", - "model.layers.55.mlp.shared_experts.gate_proj", - "model.layers.55.mlp.shared_experts.up_proj", - "model.layers.55.mlp.shared_experts.down_proj", - "model.layers.56.mlp.shared_experts.gate_proj", - "model.layers.56.mlp.shared_experts.up_proj", - "model.layers.56.mlp.shared_experts.down_proj", - "model.layers.57.mlp.shared_experts.gate_proj", - "model.layers.57.mlp.shared_experts.up_proj", - "model.layers.57.mlp.shared_experts.down_proj", - "model.layers.58.mlp.shared_experts.gate_proj", - "model.layers.58.mlp.shared_experts.up_proj", - "model.layers.58.mlp.shared_experts.down_proj", - "model.layers.59.mlp.shared_experts.gate_proj", - 
"model.layers.59.mlp.shared_experts.up_proj", - "model.layers.59.mlp.shared_experts.down_proj", - "model.layers.60.mlp.shared_experts.gate_proj", - "model.layers.60.mlp.shared_experts.up_proj", - "model.layers.60.mlp.shared_experts.down_proj", - "model.layers.61.mlp.shared_experts.gate_proj", - "model.layers.61.mlp.shared_experts.up_proj", - "model.layers.61.mlp.shared_experts.down_proj", - "model.layers.62.mlp.shared_experts.gate_proj", - "model.layers.62.mlp.shared_experts.up_proj", - "model.layers.62.mlp.shared_experts.down_proj", - "model.layers.63.mlp.shared_experts.gate_proj", - "model.layers.63.mlp.shared_experts.up_proj", - "model.layers.63.mlp.shared_experts.down_proj", - "model.layers.64.mlp.shared_experts.gate_proj", - "model.layers.64.mlp.shared_experts.up_proj", - "model.layers.64.mlp.shared_experts.down_proj", - "model.layers.65.mlp.shared_experts.gate_proj", - "model.layers.65.mlp.shared_experts.up_proj", - "model.layers.65.mlp.shared_experts.down_proj", - "model.layers.66.mlp.shared_experts.gate_proj", - "model.layers.66.mlp.shared_experts.up_proj", - "model.layers.66.mlp.shared_experts.down_proj", - "model.layers.67.mlp.shared_experts.gate_proj", - "model.layers.67.mlp.shared_experts.up_proj", - "model.layers.67.mlp.shared_experts.down_proj", - "model.layers.68.mlp.shared_experts.gate_proj", - "model.layers.68.mlp.shared_experts.up_proj", - "model.layers.68.mlp.shared_experts.down_proj", - "model.layers.69.mlp.shared_experts.gate_proj", - "model.layers.69.mlp.shared_experts.up_proj", - "model.layers.69.mlp.shared_experts.down_proj", - "model.layers.70.mlp.shared_experts.gate_proj", - "model.layers.70.mlp.shared_experts.up_proj", - "model.layers.70.mlp.shared_experts.down_proj", - "model.layers.71.mlp.shared_experts.gate_proj", - "model.layers.71.mlp.shared_experts.up_proj", - "model.layers.71.mlp.shared_experts.down_proj", - "model.layers.72.mlp.shared_experts.gate_proj", - "model.layers.72.mlp.shared_experts.up_proj", - 
"model.layers.72.mlp.shared_experts.down_proj", - "model.layers.73.mlp.shared_experts.gate_proj", - "model.layers.73.mlp.shared_experts.up_proj", - "model.layers.73.mlp.shared_experts.down_proj", - "model.layers.74.mlp.shared_experts.gate_proj", - "model.layers.74.mlp.shared_experts.up_proj", - "model.layers.74.mlp.shared_experts.down_proj", - "model.layers.75.mlp.shared_experts.gate_proj", - "model.layers.75.mlp.shared_experts.up_proj", - "model.layers.75.mlp.shared_experts.down_proj", - "model.layers.76.mlp.shared_experts.gate_proj", - "model.layers.76.mlp.shared_experts.up_proj", - "model.layers.76.mlp.shared_experts.down_proj", - "model.layers.77.mlp.shared_experts.gate_proj", - "model.layers.77.mlp.shared_experts.up_proj", - "model.layers.77.mlp.shared_experts.down_proj", - "model.layers.78.mlp.shared_experts.gate_proj", - "model.layers.78.mlp.shared_experts.up_proj", - "model.layers.78.mlp.shared_experts.down_proj", - "model.layers.79.mlp.shared_experts.gate_proj", - "model.layers.79.mlp.shared_experts.up_proj", - "model.layers.79.mlp.shared_experts.down_proj", - "model.layers.80.mlp.shared_experts.gate_proj", - "model.layers.80.mlp.shared_experts.up_proj", - "model.layers.80.mlp.shared_experts.down_proj", - "model.layers.81.mlp.shared_experts.gate_proj", - "model.layers.81.mlp.shared_experts.up_proj", - "model.layers.81.mlp.shared_experts.down_proj", - "model.layers.82.mlp.shared_experts.gate_proj", - "model.layers.82.mlp.shared_experts.up_proj", - "model.layers.82.mlp.shared_experts.down_proj", - "model.layers.83.mlp.shared_experts.gate_proj", - "model.layers.83.mlp.shared_experts.up_proj", - "model.layers.83.mlp.shared_experts.down_proj", - "model.layers.84.mlp.shared_experts.gate_proj", - "model.layers.84.mlp.shared_experts.up_proj", - "model.layers.84.mlp.shared_experts.down_proj", - "model.layers.85.mlp.shared_experts.gate_proj", - "model.layers.85.mlp.shared_experts.up_proj", - "model.layers.85.mlp.shared_experts.down_proj", - 
"model.layers.86.mlp.shared_experts.gate_proj", - "model.layers.86.mlp.shared_experts.up_proj", - "model.layers.86.mlp.shared_experts.down_proj", - "model.layers.87.mlp.shared_experts.gate_proj", - "model.layers.87.mlp.shared_experts.up_proj", - "model.layers.87.mlp.shared_experts.down_proj", - "model.layers.88.mlp.shared_experts.gate_proj", - "model.layers.88.mlp.shared_experts.up_proj", - "model.layers.88.mlp.shared_experts.down_proj", - "model.layers.89.mlp.shared_experts.gate_proj", - "model.layers.89.mlp.shared_experts.up_proj", - "model.layers.89.mlp.shared_experts.down_proj", - "model.layers.90.mlp.shared_experts.gate_proj", - "model.layers.90.mlp.shared_experts.up_proj", - "model.layers.90.mlp.shared_experts.down_proj", - "model.layers.91.mlp.shared_experts.gate_proj", - "model.layers.91.mlp.shared_experts.up_proj", - "model.layers.91.mlp.shared_experts.down_proj", + "re:.*model.layers.([3-9]|[1-8][0-9]|9[01]).mlp.shared_experts.(gate|up|down)_proj", # Ignore the output head "lm_head", From 5429a71a3092e55ed156f6e212af96374a5edf53 Mon Sep 17 00:00:00 2001 From: phaelon74 <33295008+phaelon74@users.noreply.github.com> Date: Tue, 6 Jan 2026 11:53:42 -0600 Subject: [PATCH 08/17] Update the Test_Calib_glm4_moe.py with proper stub directory. 
Signed-off-by: phaelon74 <33295008+phaelon74@users.noreply.github.com> --- tests/llmcompressor/modeling/test_calib_glm4_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/llmcompressor/modeling/test_calib_glm4_moe.py b/tests/llmcompressor/modeling/test_calib_glm4_moe.py index 4f4449b3c2..992f0c53c6 100644 --- a/tests/llmcompressor/modeling/test_calib_glm4_moe.py +++ b/tests/llmcompressor/modeling/test_calib_glm4_moe.py @@ -22,7 +22,7 @@ @requires_cadence("weekly") -@pytest.mark.parametrize("model_stub", ["THUDM/glm-4-9b-chat"]) # Update with actual GLM4 MoE model stub +@pytest.mark.parametrize("model_stub", ["zai-org/GLM-4.7"]) # Update with actual GLM4 MoE model stub def test_calib_replace_glm4moe_all_experts(model_stub): with skip_weights_download(): model = AutoModelForCausalLM.from_pretrained(model_stub, trust_remote_code=True) From 8caaf284c657b4965a12d5293962a688dc72b59a Mon Sep 17 00:00:00 2001 From: phaelon74 <33295008+phaelon74@users.noreply.github.com> Date: Tue, 6 Jan 2026 12:07:00 -0600 Subject: [PATCH 09/17] Updating the Example script to use argument parameters at script launch instead of a .env file. Signed-off-by: phaelon74 <33295008+phaelon74@users.noreply.github.com> --- examples/quantizing_moe/glm4.7_example.py | 41 +++++++++++------------ 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/examples/quantizing_moe/glm4.7_example.py b/examples/quantizing_moe/glm4.7_example.py index 35dfca4a4b..45fa903130 100644 --- a/examples/quantizing_moe/glm4.7_example.py +++ b/examples/quantizing_moe/glm4.7_example.py @@ -1,5 +1,4 @@ -import os -from pathlib import Path +import argparse from datasets import load_dataset, concatenate_datasets from transformers import AutoModelForCausalLM, AutoTokenizer @@ -11,35 +10,33 @@ # This script does W4A16 AWQ quantization of the GLM-4.7 model.
It uses Group Size of 32 and two datasets (one specific for quantization and one for reasoning models) # Running this script on an RTX PRO 6000 Workstation cards sees up to 40GB of VRAM used and roughly ~3.5 hours of run time. # This model script uses the glm4 modeling file to make sure that for each calibration sample, all experts are engaged. -# This script also uses a local .ENV file, for Source and Destination. Change as needed. +# This script accepts command-line arguments for source and destination directories. # GLM 4.7 has Dense layers for the first three layers, so we skip multiple sections of those layers. We then need to add all of that to a mapping, to apply it during quantization. # ========================= -# Load ENV Variables +# Parse Command-Line Arguments # ========================= -from dotenv import load_dotenv - -# Load the .env that sits next to this script (works regardless of where you run it) -# The .env file should be in the directory this script is run from and should look like the following: -# SRC_DIR=/media/fmodels/zai-org/GLM-4.7/ -# DST_DIR=/media/fmodels/TheHouseOfTheDude/GLM-4.7_Compressed-Tensors/W4A16_GS32 -# Those two lines are all that's needed. -load_dotenv(Path(__file__).with_name(".env")) - -def require_env(key: str) -> str: - val = os.getenv(key) - if not val or not val.strip(): - raise RuntimeError(f"Missing environment variable: {key}") - return val.strip() +parser = argparse.ArgumentParser(description="Run W4A16 AWQ quantization on GLM-4.7 model.") +parser.add_argument( + "model_path", + type=str, + help="Path to the source model directory." +) +parser.add_argument( + "output_path", + type=str, + help="Path to the destination directory for saving quantized model." 
+) -SRC_DIR = require_env("SRC_DIR") -DST_DIR = require_env("DST_DIR") +args = parser.parse_args() +model_path = args.model_path +output_path = args.output_path # ========================= # Model (GLM / GLM-MoE) # ========================= -MODEL_ID = require_env("SRC_DIR") +MODEL_ID = model_path model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) @@ -196,7 +193,7 @@ def preprocess_rombo(batch): # ========================= # Quantize + save (writes quantization_config for vLLM) # ========================= -SAVE_DIR = require_env("DST_DIR") +SAVE_DIR = output_path oneshot( model=model, From 9d3618beb0ea79a683074789c27f5d4f9e0045aa Mon Sep 17 00:00:00 2001 From: phaelon74 <33295008+phaelon74@users.noreply.github.com> Date: Tue, 6 Jan 2026 15:06:28 -0600 Subject: [PATCH 10/17] Address the items identified during quality check. Signed-off-by: phaelon74 <33295008+phaelon74@users.noreply.github.com> --- .../{glm4.7_example.py => glm4_7_example.py} | 52 +++++++++++++------ src/llmcompressor/modeling/glm4_moe.py | 8 ++- .../modeling/test_calib_glm4_moe.py | 2 +- 3 files changed, 39 insertions(+), 23 deletions(-) rename examples/quantizing_moe/{glm4.7_example.py => glm4_7_example.py} (86%) diff --git a/examples/quantizing_moe/glm4.7_example.py b/examples/quantizing_moe/glm4_7_example.py similarity index 86% rename from examples/quantizing_moe/glm4.7_example.py rename to examples/quantizing_moe/glm4_7_example.py index 45fa903130..fb82039334 100644 --- a/examples/quantizing_moe/glm4.7_example.py +++ b/examples/quantizing_moe/glm4_7_example.py @@ -1,23 +1,32 @@ import argparse -from datasets import load_dataset, concatenate_datasets +from datasets import concatenate_datasets, load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.modifiers.awq import AWQModifier, AWQMapping -from 
llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE - -# This script does W4A16 AWQ quantization of the GLM-4.7 model. It uses Group Size of 32 and two datasets (one specific for quantization and one for reasoning models) -# Running this script on an RTX PRO 6000 Workstation cards sees up to 40GB of VRAM used and roughly ~3.5 hours of run time. -# This model script uses the glm4 modeling file to make sure that for each calibration sample, all experts are engaged. -# This script accepts command-line arguments for source and destination directories. -# GLM 4.7 has Dense layers for the first three layers, so we skip multiple sections of those layers. We then need to add all of that to a mapping, to apply it during quantization. +from llmcompressor.modifiers.awq import AWQMapping, AWQModifier +from llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE # noqa: F401 + +# This script does W4A16 AWQ quantization of the GLM-4.7 model. +# It uses Group Size of 32 and two datasets (one specific for quantization +# and one for reasoning models). +# Running this script on an RTX PRO 6000 Workstation cards sees up to 40GB +# of VRAM used and roughly ~3.5 hours of run time. +# This model script uses the glm4 modeling file to make sure that for each +# calibration sample, all experts are engaged. +# This script accepts command-line arguments for source and destination +# directories. +# GLM 4.7 has Dense layers for the first three layers, so we skip multiple +# sections of those layers. We then need to add all of that to a mapping, +# to apply it during quantization. # ========================= # Parse Command-Line Arguments # ========================= -parser = argparse.ArgumentParser(description="Run W4A16 AWQ quantization on GLM-4.7 model.") +parser = argparse.ArgumentParser( + description="Run W4A16 AWQ quantization on GLM-4.7 model." 
+) parser.add_argument( "model_path", type=str, @@ -50,7 +59,10 @@ NUM_NEURALMAGIC = int(NUM_CALIBRATION_SAMPLES * 0.6) # ~307 samples NUM_ROMBO = NUM_CALIBRATION_SAMPLES - NUM_NEURALMAGIC # ~205 samples -print(f"Loading calibration datasets: {NUM_NEURALMAGIC} from Neural Magic, {NUM_ROMBO} from Rombo") +print( + f"Loading calibration datasets: {NUM_NEURALMAGIC} from Neural Magic, " + f"{NUM_ROMBO} from Rombo" +) # Load Neural Magic dataset neuralmagic_dataset_id = "neuralmagic/LLM_compression_calibration" @@ -90,23 +102,25 @@ def preprocess_neuralmagic(batch): # Format: {"instruction": "", "input": [""], "output": [""]} def preprocess_rombo(batch): rendered = [] - for instruction, inputs, outputs in zip(batch["instruction"], batch["input"], batch["output"]): + for instruction, inputs, outputs in zip( + batch["instruction"], batch["input"], batch["output"] + ): # Construct text from instruction, input, and output # Combine instruction with all input/output pairs text_parts = [instruction] - + # Handle input array (may contain multiple items) if isinstance(inputs, list) and len(inputs) > 0: for inp in inputs: if inp and inp.strip(): text_parts.append(f"\n\nInput: {inp}") - + # Handle output array (may contain multiple items) if isinstance(outputs, list) and len(outputs) > 0: for out in outputs: if out and out.strip(): text_parts.append(f"\n\nOutput: {out}") - + # Join all parts text = "".join(text_parts) rendered.append(text) @@ -206,8 +220,12 @@ def preprocess_rombo(batch): # Fix generation config validation issue before saving if hasattr(model, 'generation_config') and model.generation_config is not None: - # If temperature is set but do_sample is False, either enable do_sample or remove temperature - if hasattr(model.generation_config, 'temperature') and model.generation_config.temperature is not None: + # If temperature is set but do_sample is False, either enable do_sample + # or remove temperature + if ( + hasattr(model.generation_config, 'temperature') + and 
model.generation_config.temperature is not None + ): if not getattr(model.generation_config, 'do_sample', False): # Set do_sample=True to make temperature valid, or remove temperature model.generation_config.do_sample = True diff --git a/src/llmcompressor/modeling/glm4_moe.py b/src/llmcompressor/modeling/glm4_moe.py index c33dc878b3..b588534934 100644 --- a/src/llmcompressor/modeling/glm4_moe.py +++ b/src/llmcompressor/modeling/glm4_moe.py @@ -11,7 +11,6 @@ class CalibrationGlm4MoeMoE(MoECalibrationModule): """ Calibration version of Glm4MoeMoE that sends all tokens to all experts. - During calibration, when calibrate_all_experts=True, all tokens are sent to all experts to ensure proper quantization statistics are collected for every expert, not just those activated by the calibration data routing. @@ -35,12 +34,10 @@ def __init__( def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: """ Forward pass with optional calibration mode. - When calibrate_all_experts=True: - All tokens are sent to all experts for calibration - Routing weights are still used for final output combination - This ensures all experts see calibration data - When calibrate_all_experts=False: - Normal MoE routing behavior (only routed tokens go to each expert) """ @@ -66,7 +63,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # The output is still calculated using only the routed tokens. expert_output_full = expert(hidden_states) if not has_tokens: - continue # No tokens routed to this expert, but stats were gathered. + # No tokens routed to this expert, but stats were gathered. + continue expert_output = expert_output_full[token_indices] else: # Standard MoE behavior: only process tokens routed to this expert. @@ -87,7 +85,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: def restore(self, original: torch.nn.Module) -> torch.nn.Module: """ Restore the original module structure. 
- + Since is_permanent=False, this method is called when exiting the calibration context to restore the original MoE module. """ diff --git a/tests/llmcompressor/modeling/test_calib_glm4_moe.py b/tests/llmcompressor/modeling/test_calib_glm4_moe.py index 992f0c53c6..91858ca6c4 100644 --- a/tests/llmcompressor/modeling/test_calib_glm4_moe.py +++ b/tests/llmcompressor/modeling/test_calib_glm4_moe.py @@ -22,7 +22,7 @@ @requires_cadence("weekly") -@pytest.mark.parametrize("model_stub", ["zai-org/GLM-4.7"]) # Update with actual GLM4 MoE model stub +@pytest.mark.parametrize("model_stub", ["zai-org/GLM-4.7"]) def test_calib_replace_glm4moe_all_experts(model_stub): with skip_weights_download(): model = AutoModelForCausalLM.from_pretrained(model_stub, trust_remote_code=True) From 3ffa14d6e5a64d0fd4cdcf08cad3af64bd5eae1e Mon Sep 17 00:00:00 2001 From: phaelon74 <33295008+phaelon74@users.noreply.github.com> Date: Wed, 7 Jan 2026 14:05:08 -0600 Subject: [PATCH 11/17] Updating the order of datasets import. Signed-off-by: phaelon74 <33295008+phaelon74@users.noreply.github.com> --- examples/quantizing_moe/glm4_7_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/quantizing_moe/glm4_7_example.py b/examples/quantizing_moe/glm4_7_example.py index fb82039334..3726b7f989 100644 --- a/examples/quantizing_moe/glm4_7_example.py +++ b/examples/quantizing_moe/glm4_7_example.py @@ -1,6 +1,6 @@ import argparse -from datasets import concatenate_datasets, load_dataset +from datasets import load_dataset, concatenate_datasets from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot From e2e1b77ada47a26383e4574cbedf7b84a861a764 Mon Sep 17 00:00:00 2001 From: phaelon74 <33295008+phaelon74@users.noreply.github.com> Date: Tue, 13 Jan 2026 11:30:31 -0600 Subject: [PATCH 12/17] Removed AWQMappings and utilized AWQModifier. Also updated torch_dtype to just dtype.
Signed-off-by: phaelon74 <33295008+phaelon74@users.noreply.github.com> --- examples/quantizing_moe/glm4_7_example.py | 26 ++++++----------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/examples/quantizing_moe/glm4_7_example.py b/examples/quantizing_moe/glm4_7_example.py index 3726b7f989..bf7a512b8e 100644 --- a/examples/quantizing_moe/glm4_7_example.py +++ b/examples/quantizing_moe/glm4_7_example.py @@ -4,7 +4,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.modifiers.awq import AWQMapping, AWQModifier +from llmcompressor.modifiers.awq import AWQModifier from llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE # noqa: F401 # This script does W4A16 AWQ quantization of the GLM-4.7 model. @@ -46,7 +46,7 @@ # Model (GLM / GLM-MoE) # ========================= MODEL_ID = model_path -model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) # ========================= @@ -159,9 +159,10 @@ def preprocess_rombo(batch): # ========================= moe_ignores = [ - # Layers 0-2: Dense layer - ignore attention and MLP - "model.layers.[0-2].self_attn.(q|k|v|o)_proj", - "model.layers.[0-2].mlp.(gate|up|down)_proj", + # Layers 0-2: Dense layer - ignore entire layers + "model.layers.0.*", + "model.layers.1.*", + "model.layers.2.*", # Layers 3-91: MoE layers - ignore shared_experts "re:.*model.layers.([3-9]|[1-8][0-9]|9[01]).mlp.shared_experts.(gate|up|down)_proj", @@ -170,24 +171,9 @@ def preprocess_rombo(batch): "lm_head", ] -# Create explicit mappings that skip layers 0-2 -mappings = [] -for layer_idx in range(3, 92): # Skip dense layers 0-2 - mappings.append( - AWQMapping( - smooth_layer=f"model.layers.{layer_idx}.input_layernorm", - balance_layers=[ - f"model.layers.{layer_idx}.self_attn.q_proj", - 
f"model.layers.{layer_idx}.self_attn.k_proj", - f"model.layers.{layer_idx}.self_attn.v_proj", - ] - ) - ) - recipe = [ AWQModifier( ignore=moe_ignores, - mappings=mappings, # Provide explicit mappings config_groups={ "group_0": { "targets": ["Linear"], From db7de05742d139738711659a4b59d274e1098572 Mon Sep 17 00:00:00 2001 From: phaelon74 <33295008+phaelon74@users.noreply.github.com> Date: Tue, 13 Jan 2026 11:37:47 -0600 Subject: [PATCH 13/17] Created a helper function for fixing the generation config. Signed-off-by: phaelon74 <33295008+phaelon74@users.noreply.github.com> --- examples/quantizing_moe/glm4_7_example.py | 33 ++++++++++++++++------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/examples/quantizing_moe/glm4_7_example.py b/examples/quantizing_moe/glm4_7_example.py index bf7a512b8e..dc4c76b0f2 100644 --- a/examples/quantizing_moe/glm4_7_example.py +++ b/examples/quantizing_moe/glm4_7_example.py @@ -21,6 +21,28 @@ # to apply it during quantization. +def fix_generation_config(model): + """ + Fix generation config validation issue before saving. + + If temperature is set but do_sample is False, enable do_sample=True + to make temperature valid. This prevents validation errors when saving + models with generation configs. 
+ + :param model: The model to fix generation_config for + """ + if hasattr(model, 'generation_config') and model.generation_config is not None: + # If temperature is set but do_sample is False, either enable do_sample + # or remove temperature + if ( + hasattr(model.generation_config, 'temperature') + and model.generation_config.temperature is not None + ): + if not getattr(model.generation_config, 'do_sample', False): + # Set do_sample=True to make temperature valid, or remove temperature + model.generation_config.do_sample = True + + # ========================= # Parse Command-Line Arguments # ========================= @@ -205,16 +227,7 @@ def preprocess_rombo(batch): ) # Fix generation config validation issue before saving -if hasattr(model, 'generation_config') and model.generation_config is not None: - # If temperature is set but do_sample is False, either enable do_sample - # or remove temperature - if ( - hasattr(model.generation_config, 'temperature') - and model.generation_config.temperature is not None - ): - if not getattr(model.generation_config, 'do_sample', False): - # Set do_sample=True to make temperature valid, or remove temperature - model.generation_config.do_sample = True +fix_generation_config(model) # (Optional redundant save) model.save_pretrained(SAVE_DIR, save_compressed=True) From 19be730b7f26adaf29ee2888d89d71f982d14869 Mon Sep 17 00:00:00 2001 From: phaelon74 <33295008+phaelon74@users.noreply.github.com> Date: Tue, 20 Jan 2026 11:01:48 -0500 Subject: [PATCH 14/17] Simplified the GLM4_7 Example script. 
Signed-off-by: phaelon74 <33295008+phaelon74@users.noreply.github.com> --- examples/quantizing_moe/glm4_7_example.py | 258 ++++------------------ 1 file changed, 47 insertions(+), 211 deletions(-) diff --git a/examples/quantizing_moe/glm4_7_example.py b/examples/quantizing_moe/glm4_7_example.py index dc4c76b0f2..820b5d9e6e 100644 --- a/examples/quantizing_moe/glm4_7_example.py +++ b/examples/quantizing_moe/glm4_7_example.py @@ -1,237 +1,73 @@ -import argparse - -from datasets import load_dataset, concatenate_datasets +from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot from llmcompressor.modifiers.awq import AWQModifier -from llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE # noqa: F401 - -# This script does W4A16 AWQ quantization of the GLM-4.7 model. -# It uses Group Size of 32 and two datasets (one specific for quantization -# and one for reasoning models). -# Running this script on an RTX PRO 6000 Workstation cards sees up to 40GB -# of VRAM used and roughly ~3.5 hours of run time. -# This model script uses the glm4 modeling file to make sure that for each -# calibration sample, all experts are engaged. -# This script accepts command-line arguments for source and destination -# directories. -# GLM 4.7 has Dense layers for the first three layers, so we skip multiple -# sections of those layers. We then need to add all of that to a mapping, -# to apply it during quantization. - - -def fix_generation_config(model): - """ - Fix generation config validation issue before saving. - - If temperature is set but do_sample is False, enable do_sample=True - to make temperature valid. This prevents validation errors when saving - models with generation configs. 
- - :param model: The model to fix generation_config for - """ - if hasattr(model, 'generation_config') and model.generation_config is not None: - # If temperature is set but do_sample is False, either enable do_sample - # or remove temperature - if ( - hasattr(model.generation_config, 'temperature') - and model.generation_config.temperature is not None - ): - if not getattr(model.generation_config, 'do_sample', False): - # Set do_sample=True to make temperature valid, or remove temperature - model.generation_config.do_sample = True - - -# ========================= -# Parse Command-Line Arguments -# ========================= -parser = argparse.ArgumentParser( - description="Run W4A16 AWQ quantization on GLM-4.7 model." -) -parser.add_argument( - "model_path", - type=str, - help="Path to the source model directory." -) -parser.add_argument( - "output_path", - type=str, - help="Path to the destination directory for saving quantized model." -) - -args = parser.parse_args() -model_path = args.model_path -output_path = args.output_path - -# ========================= -# Model (GLM / GLM-MoE) -# ========================= -MODEL_ID = model_path -model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) - -# ========================= -# Calibration data (Neural Magic + Rombo Optimized Reasoning) -# ========================= -NUM_CALIBRATION_SAMPLES = 512 +from llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE + +# Load the model +model_id = "zai-org/GLM-4.7" +model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id) +# MoE calibration is now handled automatically by the pipeline. +# The `CalibrationGlm4MoeMoE` modules (from `llmcompressor.modeling.glm4_moe`) +# will be applied during calibration to enable proper expert calibration. 
+# These replace the original `Glm4MoeMoE` class from +# `transformers.models.glm4_moe.modeling_glm4_moe`. + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" + +# Select number of samples. 512 samples is a good place to start. +# Increasing the number of samples can improve accuracy. +NUM_CALIBRATION_SAMPLES = 5 MAX_SEQUENCE_LENGTH = 2048 -# Calculate sample distribution: 60% Neural Magic, 40% Rombo -NUM_NEURALMAGIC = int(NUM_CALIBRATION_SAMPLES * 0.6) # ~307 samples -NUM_ROMBO = NUM_CALIBRATION_SAMPLES - NUM_NEURALMAGIC # ~205 samples +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) -print( - f"Loading calibration datasets: {NUM_NEURALMAGIC} from Neural Magic, " - f"{NUM_ROMBO} from Rombo" -) -# Load Neural Magic dataset -neuralmagic_dataset_id = "neuralmagic/LLM_compression_calibration" -neuralmagic_split = "train" -ds_neuralmagic = load_dataset(neuralmagic_dataset_id, split=neuralmagic_split) - -# Sample from Neural Magic dataset -n_nm = min(NUM_NEURALMAGIC, len(ds_neuralmagic)) -ds_neuralmagic = ds_neuralmagic.shuffle(seed=42).select(range(n_nm)) - -# Render messages to chat-style text (batch) -# The neuralmagic dataset has "messages" field with user/assistant roles -def preprocess_neuralmagic(batch): - rendered = [] - for messages in batch["messages"]: - # Apply chat template to the messages directly - text = tokenizer.apply_chat_template( - messages, +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], tokenize=False, - add_generation_prompt=False, ) - rendered.append(text) - return {"text": rendered} - -ds_neuralmagic = ds_neuralmagic.map(preprocess_neuralmagic, batched=True, num_proc=4) - -# Load Rombo Optimized Reasoning dataset -rombo_dataset_id = "Rombo-Org/Optimized_Reasoning" -rombo_split = "train" -ds_rombo = load_dataset(rombo_dataset_id, 
split=rombo_split) - -# Sample from Rombo dataset -n_rombo = min(NUM_ROMBO, len(ds_rombo)) -ds_rombo = ds_rombo.shuffle(seed=43).select(range(n_rombo)) - -# Preprocess Rombo dataset -# Format: {"instruction": "", "input": [""], "output": [""]} -def preprocess_rombo(batch): - rendered = [] - for instruction, inputs, outputs in zip( - batch["instruction"], batch["input"], batch["output"] - ): - # Construct text from instruction, input, and output - # Combine instruction with all input/output pairs - text_parts = [instruction] - - # Handle input array (may contain multiple items) - if isinstance(inputs, list) and len(inputs) > 0: - for inp in inputs: - if inp and inp.strip(): - text_parts.append(f"\n\nInput: {inp}") - - # Handle output array (may contain multiple items) - if isinstance(outputs, list) and len(outputs) > 0: - for out in outputs: - if out and out.strip(): - text_parts.append(f"\n\nOutput: {out}") - - # Join all parts - text = "".join(text_parts) - rendered.append(text) - return {"text": rendered} - -ds_rombo = ds_rombo.map(preprocess_rombo, batched=True, num_proc=4) - -# Combine both datasets -ds = concatenate_datasets([ds_neuralmagic, ds_rombo]) - -# Shuffle the combined dataset -ds = ds.shuffle(seed=44) - -# Tokenize in batches -ds = ds.map( - lambda batch: tokenizer( - batch["text"], + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. 
+def tokenize(sample): + return tokenizer( + sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False, - ), - batched=True, - remove_columns=ds.column_names, - num_proc=4, -) + ) + -print(f"Combined calibration dataset: {len(ds)} samples") - -# ========================= -# AWQ recipe with config_groups -# - Weight-only INT4 (W4A16 **symmetric**) -# - group_size: 32 -# - IMPORTANT: do NOT ignore mlp.gate / gate_up_proj (merged layer) -# - Keep router and output head unquantized -# ========================= - -moe_ignores = [ - # Layers 0-2: Dense layer - ignore entire layers - "model.layers.0.*", - "model.layers.1.*", - "model.layers.2.*", - - # Layers 3-91: MoE layers - ignore shared_experts - "re:.*model.layers.([3-9]|[1-8][0-9]|9[01]).mlp.shared_experts.(gate|up|down)_proj", - - # Ignore the output head - "lm_head", -] - -recipe = [ - AWQModifier( - ignore=moe_ignores, - config_groups={ - "group_0": { - "targets": ["Linear"], - "weights": { - "num_bits": 4, - "type": "int", - "symmetric": True, # W4A16 (symmetric) - "strategy": "group", - "group_size": 32, - "dynamic": False, - }, - }, - }, - ), -] - -# ========================= -# Quantize + save (writes quantization_config for vLLM) -# ========================= -SAVE_DIR = output_path +ds = ds.map(tokenize, remove_columns=ds.column_names) +# Configure the quantization algorithm to run. +# * quantize the weights to 4 bit with GPTQ with a group size 128 +recipe = AWQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]) + +# Apply algorithms. oneshot( model=model, dataset=ds, recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, -# output_dir=SAVE_DIR, ) -# Fix generation config validation issue before saving -fix_generation_config(model) - -# (Optional redundant save) +# Save to disk compressed. 
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) - -print("Saved to:", SAVE_DIR) - From 014cfea833b929c31d47d27db5c5a85557d1b0e9 Mon Sep 17 00:00:00 2001 From: phaelon74 <33295008+phaelon74@users.noreply.github.com> Date: Tue, 20 Jan 2026 11:11:11 -0500 Subject: [PATCH 15/17] Adding MOE_ignore layers back into Simplified Script. Signed-off-by: phaelon74 <33295008+phaelon74@users.noreply.github.com> --- examples/quantizing_moe/glm4_7_example.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/examples/quantizing_moe/glm4_7_example.py b/examples/quantizing_moe/glm4_7_example.py index 820b5d9e6e..0b4bb212ec 100644 --- a/examples/quantizing_moe/glm4_7_example.py +++ b/examples/quantizing_moe/glm4_7_example.py @@ -21,7 +21,7 @@ # Select number of samples. 512 samples is a good place to start. # Increasing the number of samples can improve accuracy. -NUM_CALIBRATION_SAMPLES = 5 +NUM_CALIBRATION_SAMPLES = 512 MAX_SEQUENCE_LENGTH = 2048 # Load dataset and preprocess. @@ -54,9 +54,19 @@ def tokenize(sample): ds = ds.map(tokenize, remove_columns=ds.column_names) +moe_ignores = [ + # Layers 0-2: Dense layer - ignore entire layers + "model.layers.0.*", + "model.layers.1.*", + "model.layers.2.*", + + # Ignore the output head + "lm_head", +] + # Configure the quantization algorithm to run. # * quantize the weights to 4 bit with GPTQ with a group size 128 -recipe = AWQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]) +recipe = AWQModifier(targets="Linear", scheme="W4A16", ignore=moe_ignores) # Apply algorithms. oneshot( From 4f0bb663cce0f6f91089512032a48fa178599ac6 Mon Sep 17 00:00:00 2001 From: phaelon74 <33295008+phaelon74@users.noreply.github.com> Date: Tue, 20 Jan 2026 12:27:31 -0500 Subject: [PATCH 16/17] Fixing Import Order. 
Signed-off-by: phaelon74 <33295008+phaelon74@users.noreply.github.com> --- examples/quantizing_moe/glm4_7_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/quantizing_moe/glm4_7_example.py b/examples/quantizing_moe/glm4_7_example.py index 0b4bb212ec..247d992e36 100644 --- a/examples/quantizing_moe/glm4_7_example.py +++ b/examples/quantizing_moe/glm4_7_example.py @@ -3,7 +3,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.awq import AWQModifier -from llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE +from llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE # noqa: F401 # Load the model model_id = "zai-org/GLM-4.7" From 083c35bd91b4b3022f091dea14a9c290217f903d Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 20 Jan 2026 13:43:10 -0500 Subject: [PATCH 17/17] Format --- examples/quantizing_moe/glm4_7_example.py | 3 +-- src/llmcompressor/modeling/glm4_moe.py | 1 - tests/llmcompressor/modeling/test_calib_glm4_moe.py | 1 - 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/quantizing_moe/glm4_7_example.py b/examples/quantizing_moe/glm4_7_example.py index 247d992e36..c3ed6ef8c1 100644 --- a/examples/quantizing_moe/glm4_7_example.py +++ b/examples/quantizing_moe/glm4_7_example.py @@ -2,8 +2,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.modifiers.awq import AWQModifier from llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE # noqa: F401 +from llmcompressor.modifiers.awq import AWQModifier # Load the model model_id = "zai-org/GLM-4.7" @@ -59,7 +59,6 @@ def tokenize(sample): "model.layers.0.*", "model.layers.1.*", "model.layers.2.*", - # Ignore the output head "lm_head", ] diff --git a/src/llmcompressor/modeling/glm4_moe.py b/src/llmcompressor/modeling/glm4_moe.py index b588534934..4f4e470d50 100644 --- a/src/llmcompressor/modeling/glm4_moe.py +++ b/src/llmcompressor/modeling/glm4_moe.py @@ -90,4 
+90,3 @@ def restore(self, original: torch.nn.Module) -> torch.nn.Module: the calibration context to restore the original MoE module. """ return original - diff --git a/tests/llmcompressor/modeling/test_calib_glm4_moe.py b/tests/llmcompressor/modeling/test_calib_glm4_moe.py index 91858ca6c4..8ca1c92cb0 100644 --- a/tests/llmcompressor/modeling/test_calib_glm4_moe.py +++ b/tests/llmcompressor/modeling/test_calib_glm4_moe.py @@ -89,4 +89,3 @@ def test_calib_glm4moe_module(): with calibration_forward_context(module): output = module(sample) assert torch.allclose(true_output, output, atol=1e-6) -