"""Calibration support for GLM-4 MoE models.

Registers a calibration-time replacement for ``Glm4MoeMoE`` that can route
every calibration token through every expert, so quantization observers
collect statistics for all experts rather than only the routed ones.
"""

import torch
from transformers.models.glm4_moe.configuration_glm4_moe import Glm4MoeConfig
from transformers.models.glm4_moe.modeling_glm4_moe import (
    Glm4MoeMoE as OriginalGlm4MoeMoE,
)

from llmcompressor.modeling.moe_context import MoECalibrationModule


@MoECalibrationModule.register("Glm4MoeMoE")
class CalibrationGlm4MoeMoE(MoECalibrationModule):
    """
    Calibration version of Glm4MoeMoE that sends all tokens to all experts.

    During calibration, when calibrate_all_experts=True, all tokens are sent
    to all experts to ensure proper quantization statistics are collected for
    every expert, not just those activated by the calibration data routing.
    """

    # Not permanent: the original module is restored when the calibration
    # context exits (see ``restore``).
    is_permanent = False

    def __init__(
        self,
        original: OriginalGlm4MoeMoE,
        config: Glm4MoeConfig,
        calibrate_all_experts: bool = True,
    ):
        """
        :param original: the Glm4MoeMoE module being wrapped for calibration
        :param config: model configuration (kept for reference)
        :param calibrate_all_experts: when True, run every token through every
            expert so all experts see calibration data
        """
        super().__init__()
        self.config = config
        # Reuse the original submodules so observed statistics attach to the
        # exact parameters that will later be quantized.
        self.experts = original.experts
        self.gate = original.gate
        self.shared_experts = original.shared_experts
        self.calibrate_all_experts = calibrate_all_experts

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        Forward pass with optional calibration mode.

        When calibrate_all_experts=True, all tokens are run through every
        expert (to gather statistics) while routing weights still determine
        the final output combination. When False, behavior matches standard
        MoE routing: only routed tokens reach each expert.
        """
        residuals = hidden_states
        orig_shape = hidden_states.shape
        topk_indices, topk_weights = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])

        # Inline of the upstream moe() logic, extended with calibration
        # support. Accumulate in the router-weight dtype.
        final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)
        expert_mask = torch.nn.functional.one_hot(
            topk_indices, num_classes=len(self.experts)
        )
        expert_mask = expert_mask.permute(2, 0, 1)

        for expert_idx, expert in enumerate(self.experts):
            mask = expert_mask[expert_idx]
            token_indices, weight_indices = torch.where(mask)
            has_tokens = token_indices.numel() > 0

            if self.calibrate_all_experts:
                # Run *all* tokens through the expert so observers see the
                # full calibration distribution. Only routed tokens
                # contribute to the output below.
                expert_output_full = expert(hidden_states)
                if not has_tokens:
                    continue  # stats gathered; nothing routed to this expert
                expert_output = expert_output_full[token_indices]
            else:
                # Standard MoE: only process tokens routed to this expert.
                if not has_tokens:
                    continue
                expert_output = expert(hidden_states[token_indices])

            # Shared combination logic for both branches (deduplicated).
            expert_weights = topk_weights[token_indices, weight_indices]
            weighted_output = expert_output * expert_weights.unsqueeze(-1)
            final_hidden_states.index_add_(0, token_indices, weighted_output)

        hidden_states = final_hidden_states.type(hidden_states.dtype).view(*orig_shape)
        # Shared experts always process every token, added on top of routing.
        hidden_states = hidden_states + self.shared_experts(residuals)
        return hidden_states

    def restore(self, original: torch.nn.Module) -> torch.nn.Module:
        """
        Restore the original module structure.

        Since is_permanent=False, this is called when exiting the calibration
        context; the wrapped original is simply returned unchanged.
        """
        return original
"""Tests for the GLM-4 MoE calibration module replacement."""

import contextlib
from functools import partial

import pytest
import torch
from transformers import AutoModelForCausalLM

from llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE
from llmcompressor.modeling.moe_context import moe_calibration_context
from llmcompressor.utils.dev import skip_weights_download
from llmcompressor.utils.helpers import calibration_forward_context
from tests.testing_utils import requires_cadence, requires_gpu

# Skip the whole module on transformers versions without GLM-4 MoE support.
Glm4MoeConfig = pytest.importorskip(
    "transformers.models.glm4_moe.configuration_glm4_moe",
    reason="Glm4MoeConfig not available in this version of transformers",
).Glm4MoeConfig
OriginalGlm4MoeMoE = pytest.importorskip(
    "transformers.models.glm4_moe.modeling_glm4_moe",
    reason="Glm4MoeMoE not available in this version of transformers",
).Glm4MoeMoE


@requires_cadence("weekly")
# TODO(review): "THUDM/glm-4-9b-chat" is a dense checkpoint, not a GLM-4 MoE
# model — update to an actual GLM-4 MoE stub or the isinstance search below
# will find no CalibrationGlm4MoeMoE layer.
@pytest.mark.parametrize("model_stub", ["THUDM/glm-4-9b-chat"])
def test_calib_replace_glm4moe_all_experts(model_stub):
    """Every expert must receive tokens when calibrate_all_experts=True."""
    with skip_weights_download():
        model = AutoModelForCausalLM.from_pretrained(model_stub, trust_remote_code=True)

    with contextlib.ExitStack() as stack:
        stack.enter_context(calibration_forward_context(model))
        stack.enter_context(moe_calibration_context(model, calibrate_all_experts=True))

        # Locate the first MoE layer replaced by the calibration context.
        moe_layer = None
        for _, module in model.named_modules():
            if isinstance(module, CalibrationGlm4MoeMoE):
                moe_layer = module
                break

        assert moe_layer is not None

        num_experts = len(moe_layer.experts)
        expert_triggered = [False for _ in range(num_experts)]

        def hook_fn(i, module, input, output):
            # Records that expert *i* executed a forward pass.
            expert_triggered[i] = True

        # functools.partial binds each expert index into its hook.
        for i, expert in enumerate(moe_layer.experts):
            expert.register_forward_hook(partial(hook_fn, i))

        # Dummy input shaped like hidden_states for a direct layer call.
        hidden_dim = model.config.hidden_size
        batch, seq_len = 4, 32
        sample = torch.randn(batch, seq_len, hidden_dim, dtype=torch.float32)

        with torch.no_grad():
            _ = moe_layer(sample)

        assert all(
            expert_triggered
        ), f"Not all experts were triggered: {expert_triggered}"


@requires_gpu
def test_calib_glm4moe_module():
    """Calibration wrapper must reproduce the original module's output."""
    config = Glm4MoeConfig()
    with torch.device("cuda"):
        original = OriginalGlm4MoeMoE(config).eval()

    # Dummy input shaped like hidden_states.
    hidden_dim = config.hidden_size
    batch, seq_len = 4, 32
    sample = torch.randn(batch, seq_len, hidden_dim, device="cuda")

    with calibration_forward_context(original):
        true_output = original(sample)

    # Both calibration modes must be numerically identical to the original:
    # calibrate_all_experts only changes which tokens each expert *sees*,
    # never the routed combination that forms the output.
    module = CalibrationGlm4MoeMoE(original, config, calibrate_all_experts=True)
    with calibration_forward_context(module):
        output = module(sample)
    assert torch.allclose(true_output, output, atol=1e-6)

    module = CalibrationGlm4MoeMoE(original, config, calibrate_all_experts=False)
    with calibration_forward_context(module):
        output = module(sample)
    assert torch.allclose(true_output, output, atol=1e-6)
import os
from pathlib import Path

from datasets import concatenate_datasets, load_dataset
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
# Imported for its side effect: registers the GLM-4 MoE calibration module
# so oneshot routes every calibration sample through every expert.
from llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE  # noqa: F401
from llmcompressor.modifiers.awq import AWQMapping, AWQModifier

# W4A16 AWQ quantization of GLM-4.7 with group size 32, calibrated on two
# datasets (a general compression-calibration set plus a reasoning set).
# Observed on an RTX PRO 6000 workstation card: up to ~40 GB VRAM and
# roughly ~3.5 hours of runtime — your numbers will vary.
#
# GLM-4.7 uses dense MLPs for the first three decoder layers; those layers
# (plus each MoE layer's shared experts and the output head) are excluded
# from quantization below, and the AWQ mappings likewise skip layers 0-2.

# =========================
# Load ENV variables
# =========================
# The .env file must sit next to this script and define exactly:
#   SRC_DIR=/path/to/source/model/
#   DST_DIR=/path/to/output/dir
load_dotenv(Path(__file__).with_name(".env"))


def require_env(key: str) -> str:
    """Return the stripped value of environment variable *key*.

    :raises RuntimeError: if the variable is unset or blank.
    """
    val = os.getenv(key)
    if not val or not val.strip():
        raise RuntimeError(f"Missing environment variable: {key}")
    return val.strip()


SRC_DIR = require_env("SRC_DIR")
DST_DIR = require_env("DST_DIR")

# =========================
# Model (GLM / GLM-MoE)
# =========================
model = AutoModelForCausalLM.from_pretrained(SRC_DIR, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(SRC_DIR, trust_remote_code=True)

# =========================
# Calibration data: 60% Neural Magic, 40% Rombo Optimized Reasoning
# =========================
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

NUM_NEURALMAGIC = int(NUM_CALIBRATION_SAMPLES * 0.6)  # ~307 samples
NUM_ROMBO = NUM_CALIBRATION_SAMPLES - NUM_NEURALMAGIC  # ~205 samples

print(
    f"Loading calibration datasets: {NUM_NEURALMAGIC} from Neural Magic, "
    f"{NUM_ROMBO} from Rombo"
)

ds_neuralmagic = load_dataset("neuralmagic/LLM_compression_calibration", split="train")
n_nm = min(NUM_NEURALMAGIC, len(ds_neuralmagic))
ds_neuralmagic = ds_neuralmagic.shuffle(seed=42).select(range(n_nm))


def preprocess_neuralmagic(batch):
    """Render the dataset's chat ``messages`` into plain text via the
    tokenizer's chat template."""
    rendered = []
    for messages in batch["messages"]:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
        )
        rendered.append(text)
    return {"text": rendered}


ds_neuralmagic = ds_neuralmagic.map(preprocess_neuralmagic, batched=True, num_proc=4)

# Rombo format: {"instruction": "", "input": [""], "output": [""]}
ds_rombo = load_dataset("Rombo-Org/Optimized_Reasoning", split="train")
n_rombo = min(NUM_ROMBO, len(ds_rombo))
ds_rombo = ds_rombo.shuffle(seed=43).select(range(n_rombo))


def preprocess_rombo(batch):
    """Join each instruction with its non-empty input/output items into one
    text sample."""
    rendered = []
    for instruction, inputs, outputs in zip(
        batch["instruction"], batch["input"], batch["output"]
    ):
        parts = [instruction]
        if isinstance(inputs, list):
            parts.extend(f"\n\nInput: {inp}" for inp in inputs if inp and inp.strip())
        if isinstance(outputs, list):
            parts.extend(
                f"\n\nOutput: {out}" for out in outputs if out and out.strip()
            )
        rendered.append("".join(parts))
    return {"text": rendered}


ds_rombo = ds_rombo.map(preprocess_rombo, batched=True, num_proc=4)

# Combine, shuffle, and tokenize in batches.
ds = concatenate_datasets([ds_neuralmagic, ds_rombo]).shuffle(seed=44)
ds = ds.map(
    lambda batch: tokenizer(
        batch["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    ),
    batched=True,
    remove_columns=ds.column_names,
    num_proc=4,
)

print(f"Combined calibration dataset: {len(ds)} samples")

# =========================
# AWQ recipe with config_groups
# - Weight-only INT4 (W4A16 symmetric), group_size 32
# - IMPORTANT: do NOT ignore mlp.gate / gate_up_proj (merged layer)
# - Keep shared experts and the output head unquantized
# =========================
moe_ignores = [
    # NOTE: llm-compressor/compressed-tensors only treats patterns prefixed
    # with "re:" as regular expressions; without the prefix these would be
    # matched as literal module names and ignore nothing.
    # Layers 0-2 are dense: skip attention and MLP projections.
    r"re:model\.layers\.[0-2]\.self_attn\.(q|k|v|o)_proj",
    r"re:model\.layers\.[0-2]\.mlp\.(gate|up|down)_proj",
    # MoE layers (3-91): keep the always-active shared experts unquantized.
    # Dense layers 0-2 have no shared_experts, so matching every layer index
    # here is safe.
    r"re:model\.layers\.\d+\.mlp\.shared_experts\.(gate|up|down)_proj",
    # Keep the output head unquantized.
    "lm_head",
]

# Explicit AWQ mappings that skip the dense layers (0-2).
mappings = [
    AWQMapping(
        smooth_layer=f"model.layers.{layer_idx}.input_layernorm",
        balance_layers=[
            f"model.layers.{layer_idx}.self_attn.q_proj",
            f"model.layers.{layer_idx}.self_attn.k_proj",
            f"model.layers.{layer_idx}.self_attn.v_proj",
        ],
    )
    for layer_idx in range(3, 92)  # MoE layers only
]

recipe = [
    AWQModifier(
        ignore=moe_ignores,
        mappings=mappings,  # Provide explicit mappings
        config_groups={
            "group_0": {
                "targets": ["Linear"],
                "weights": {
                    "num_bits": 4,
                    "type": "int",
                    "symmetric": True,  # W4A16 (symmetric)
                    "strategy": "group",
                    "group_size": 32,
                    "dynamic": False,
                },
            },
        },
    ),
]

# =========================
# Quantize + save (writes quantization_config for vLLM)
# =========================
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Work around generation-config validation: a temperature set while
# do_sample=False fails save-time validation, so enable sampling.
gen_cfg = getattr(model, "generation_config", None)
if gen_cfg is not None and getattr(gen_cfg, "temperature", None) is not None:
    if not getattr(gen_cfg, "do_sample", False):
        gen_cfg.do_sample = True

model.save_pretrained(DST_DIR, save_compressed=True)
tokenizer.save_pretrained(DST_DIR)

print("Saved to:", DST_DIR)
"model.layers.6.mlp.shared_experts.gate_proj", - "model.layers.6.mlp.shared_experts.up_proj", - "model.layers.6.mlp.shared_experts.down_proj", - "model.layers.7.mlp.shared_experts.gate_proj", - "model.layers.7.mlp.shared_experts.up_proj", - "model.layers.7.mlp.shared_experts.down_proj", - "model.layers.8.mlp.shared_experts.gate_proj", - "model.layers.8.mlp.shared_experts.up_proj", - "model.layers.8.mlp.shared_experts.down_proj", - "model.layers.9.mlp.shared_experts.gate_proj", - "model.layers.9.mlp.shared_experts.up_proj", - "model.layers.9.mlp.shared_experts.down_proj", - "model.layers.10.mlp.shared_experts.gate_proj", - "model.layers.10.mlp.shared_experts.up_proj", - "model.layers.10.mlp.shared_experts.down_proj", - "model.layers.11.mlp.shared_experts.gate_proj", - "model.layers.11.mlp.shared_experts.up_proj", - "model.layers.11.mlp.shared_experts.down_proj", - "model.layers.12.mlp.shared_experts.gate_proj", - "model.layers.12.mlp.shared_experts.up_proj", - "model.layers.12.mlp.shared_experts.down_proj", - "model.layers.13.mlp.shared_experts.gate_proj", - "model.layers.13.mlp.shared_experts.up_proj", - "model.layers.13.mlp.shared_experts.down_proj", - "model.layers.14.mlp.shared_experts.gate_proj", - "model.layers.14.mlp.shared_experts.up_proj", - "model.layers.14.mlp.shared_experts.down_proj", - "model.layers.15.mlp.shared_experts.gate_proj", - "model.layers.15.mlp.shared_experts.up_proj", - "model.layers.15.mlp.shared_experts.down_proj", - "model.layers.16.mlp.shared_experts.gate_proj", - "model.layers.16.mlp.shared_experts.up_proj", - "model.layers.16.mlp.shared_experts.down_proj", - "model.layers.17.mlp.shared_experts.gate_proj", - "model.layers.17.mlp.shared_experts.up_proj", - "model.layers.17.mlp.shared_experts.down_proj", - "model.layers.18.mlp.shared_experts.gate_proj", - "model.layers.18.mlp.shared_experts.up_proj", - "model.layers.18.mlp.shared_experts.down_proj", - "model.layers.19.mlp.shared_experts.gate_proj", - 
"model.layers.19.mlp.shared_experts.up_proj", - "model.layers.19.mlp.shared_experts.down_proj", - "model.layers.20.mlp.shared_experts.gate_proj", - "model.layers.20.mlp.shared_experts.up_proj", - "model.layers.20.mlp.shared_experts.down_proj", - "model.layers.21.mlp.shared_experts.gate_proj", - "model.layers.21.mlp.shared_experts.up_proj", - "model.layers.21.mlp.shared_experts.down_proj", - "model.layers.22.mlp.shared_experts.gate_proj", - "model.layers.22.mlp.shared_experts.up_proj", - "model.layers.22.mlp.shared_experts.down_proj", - "model.layers.23.mlp.shared_experts.gate_proj", - "model.layers.23.mlp.shared_experts.up_proj", - "model.layers.23.mlp.shared_experts.down_proj", - "model.layers.24.mlp.shared_experts.gate_proj", - "model.layers.24.mlp.shared_experts.up_proj", - "model.layers.24.mlp.shared_experts.down_proj", - "model.layers.25.mlp.shared_experts.gate_proj", - "model.layers.25.mlp.shared_experts.up_proj", - "model.layers.25.mlp.shared_experts.down_proj", - "model.layers.26.mlp.shared_experts.gate_proj", - "model.layers.26.mlp.shared_experts.up_proj", - "model.layers.26.mlp.shared_experts.down_proj", - "model.layers.27.mlp.shared_experts.gate_proj", - "model.layers.27.mlp.shared_experts.up_proj", - "model.layers.27.mlp.shared_experts.down_proj", - "model.layers.28.mlp.shared_experts.gate_proj", - "model.layers.28.mlp.shared_experts.up_proj", - "model.layers.28.mlp.shared_experts.down_proj", - "model.layers.29.mlp.shared_experts.gate_proj", - "model.layers.29.mlp.shared_experts.up_proj", - "model.layers.29.mlp.shared_experts.down_proj", - "model.layers.30.mlp.shared_experts.gate_proj", - "model.layers.30.mlp.shared_experts.up_proj", - "model.layers.30.mlp.shared_experts.down_proj", - "model.layers.31.mlp.shared_experts.gate_proj", - "model.layers.31.mlp.shared_experts.up_proj", - "model.layers.31.mlp.shared_experts.down_proj", - "model.layers.32.mlp.shared_experts.gate_proj", - "model.layers.32.mlp.shared_experts.up_proj", - 
"model.layers.32.mlp.shared_experts.down_proj", - "model.layers.33.mlp.shared_experts.gate_proj", - "model.layers.33.mlp.shared_experts.up_proj", - "model.layers.33.mlp.shared_experts.down_proj", - "model.layers.34.mlp.shared_experts.gate_proj", - "model.layers.34.mlp.shared_experts.up_proj", - "model.layers.34.mlp.shared_experts.down_proj", - "model.layers.35.mlp.shared_experts.gate_proj", - "model.layers.35.mlp.shared_experts.up_proj", - "model.layers.35.mlp.shared_experts.down_proj", - "model.layers.36.mlp.shared_experts.gate_proj", - "model.layers.36.mlp.shared_experts.up_proj", - "model.layers.36.mlp.shared_experts.down_proj", - "model.layers.37.mlp.shared_experts.gate_proj", - "model.layers.37.mlp.shared_experts.up_proj", - "model.layers.37.mlp.shared_experts.down_proj", - "model.layers.38.mlp.shared_experts.gate_proj", - "model.layers.38.mlp.shared_experts.up_proj", - "model.layers.38.mlp.shared_experts.down_proj", - "model.layers.39.mlp.shared_experts.gate_proj", - "model.layers.39.mlp.shared_experts.up_proj", - "model.layers.39.mlp.shared_experts.down_proj", - "model.layers.40.mlp.shared_experts.gate_proj", - "model.layers.40.mlp.shared_experts.up_proj", - "model.layers.40.mlp.shared_experts.down_proj", - "model.layers.41.mlp.shared_experts.gate_proj", - "model.layers.41.mlp.shared_experts.up_proj", - "model.layers.41.mlp.shared_experts.down_proj", - "model.layers.42.mlp.shared_experts.gate_proj", - "model.layers.42.mlp.shared_experts.up_proj", - "model.layers.42.mlp.shared_experts.down_proj", - "model.layers.43.mlp.shared_experts.gate_proj", - "model.layers.43.mlp.shared_experts.up_proj", - "model.layers.43.mlp.shared_experts.down_proj", - "model.layers.44.mlp.shared_experts.gate_proj", - "model.layers.44.mlp.shared_experts.up_proj", - "model.layers.44.mlp.shared_experts.down_proj", - "model.layers.45.mlp.shared_experts.gate_proj", - "model.layers.45.mlp.shared_experts.up_proj", - "model.layers.45.mlp.shared_experts.down_proj", - 
"model.layers.46.mlp.shared_experts.gate_proj", - "model.layers.46.mlp.shared_experts.up_proj", - "model.layers.46.mlp.shared_experts.down_proj", - "model.layers.47.mlp.shared_experts.gate_proj", - "model.layers.47.mlp.shared_experts.up_proj", - "model.layers.47.mlp.shared_experts.down_proj", - "model.layers.48.mlp.shared_experts.gate_proj", - "model.layers.48.mlp.shared_experts.up_proj", - "model.layers.48.mlp.shared_experts.down_proj", - "model.layers.49.mlp.shared_experts.gate_proj", - "model.layers.49.mlp.shared_experts.up_proj", - "model.layers.49.mlp.shared_experts.down_proj", - "model.layers.50.mlp.shared_experts.gate_proj", - "model.layers.50.mlp.shared_experts.up_proj", - "model.layers.50.mlp.shared_experts.down_proj", - "model.layers.51.mlp.shared_experts.gate_proj", - "model.layers.51.mlp.shared_experts.up_proj", - "model.layers.51.mlp.shared_experts.down_proj", - "model.layers.52.mlp.shared_experts.gate_proj", - "model.layers.52.mlp.shared_experts.up_proj", - "model.layers.52.mlp.shared_experts.down_proj", - "model.layers.53.mlp.shared_experts.gate_proj", - "model.layers.53.mlp.shared_experts.up_proj", - "model.layers.53.mlp.shared_experts.down_proj", - "model.layers.54.mlp.shared_experts.gate_proj", - "model.layers.54.mlp.shared_experts.up_proj", - "model.layers.54.mlp.shared_experts.down_proj", - "model.layers.55.mlp.shared_experts.gate_proj", - "model.layers.55.mlp.shared_experts.up_proj", - "model.layers.55.mlp.shared_experts.down_proj", - "model.layers.56.mlp.shared_experts.gate_proj", - "model.layers.56.mlp.shared_experts.up_proj", - "model.layers.56.mlp.shared_experts.down_proj", - "model.layers.57.mlp.shared_experts.gate_proj", - "model.layers.57.mlp.shared_experts.up_proj", - "model.layers.57.mlp.shared_experts.down_proj", - "model.layers.58.mlp.shared_experts.gate_proj", - "model.layers.58.mlp.shared_experts.up_proj", - "model.layers.58.mlp.shared_experts.down_proj", - "model.layers.59.mlp.shared_experts.gate_proj", - 
"model.layers.59.mlp.shared_experts.up_proj", - "model.layers.59.mlp.shared_experts.down_proj", - "model.layers.60.mlp.shared_experts.gate_proj", - "model.layers.60.mlp.shared_experts.up_proj", - "model.layers.60.mlp.shared_experts.down_proj", - "model.layers.61.mlp.shared_experts.gate_proj", - "model.layers.61.mlp.shared_experts.up_proj", - "model.layers.61.mlp.shared_experts.down_proj", - "model.layers.62.mlp.shared_experts.gate_proj", - "model.layers.62.mlp.shared_experts.up_proj", - "model.layers.62.mlp.shared_experts.down_proj", - "model.layers.63.mlp.shared_experts.gate_proj", - "model.layers.63.mlp.shared_experts.up_proj", - "model.layers.63.mlp.shared_experts.down_proj", - "model.layers.64.mlp.shared_experts.gate_proj", - "model.layers.64.mlp.shared_experts.up_proj", - "model.layers.64.mlp.shared_experts.down_proj", - "model.layers.65.mlp.shared_experts.gate_proj", - "model.layers.65.mlp.shared_experts.up_proj", - "model.layers.65.mlp.shared_experts.down_proj", - "model.layers.66.mlp.shared_experts.gate_proj", - "model.layers.66.mlp.shared_experts.up_proj", - "model.layers.66.mlp.shared_experts.down_proj", - "model.layers.67.mlp.shared_experts.gate_proj", - "model.layers.67.mlp.shared_experts.up_proj", - "model.layers.67.mlp.shared_experts.down_proj", - "model.layers.68.mlp.shared_experts.gate_proj", - "model.layers.68.mlp.shared_experts.up_proj", - "model.layers.68.mlp.shared_experts.down_proj", - "model.layers.69.mlp.shared_experts.gate_proj", - "model.layers.69.mlp.shared_experts.up_proj", - "model.layers.69.mlp.shared_experts.down_proj", - "model.layers.70.mlp.shared_experts.gate_proj", - "model.layers.70.mlp.shared_experts.up_proj", - "model.layers.70.mlp.shared_experts.down_proj", - "model.layers.71.mlp.shared_experts.gate_proj", - "model.layers.71.mlp.shared_experts.up_proj", - "model.layers.71.mlp.shared_experts.down_proj", - "model.layers.72.mlp.shared_experts.gate_proj", - "model.layers.72.mlp.shared_experts.up_proj", - 
"model.layers.72.mlp.shared_experts.down_proj", - "model.layers.73.mlp.shared_experts.gate_proj", - "model.layers.73.mlp.shared_experts.up_proj", - "model.layers.73.mlp.shared_experts.down_proj", - "model.layers.74.mlp.shared_experts.gate_proj", - "model.layers.74.mlp.shared_experts.up_proj", - "model.layers.74.mlp.shared_experts.down_proj", - "model.layers.75.mlp.shared_experts.gate_proj", - "model.layers.75.mlp.shared_experts.up_proj", - "model.layers.75.mlp.shared_experts.down_proj", - "model.layers.76.mlp.shared_experts.gate_proj", - "model.layers.76.mlp.shared_experts.up_proj", - "model.layers.76.mlp.shared_experts.down_proj", - "model.layers.77.mlp.shared_experts.gate_proj", - "model.layers.77.mlp.shared_experts.up_proj", - "model.layers.77.mlp.shared_experts.down_proj", - "model.layers.78.mlp.shared_experts.gate_proj", - "model.layers.78.mlp.shared_experts.up_proj", - "model.layers.78.mlp.shared_experts.down_proj", - "model.layers.79.mlp.shared_experts.gate_proj", - "model.layers.79.mlp.shared_experts.up_proj", - "model.layers.79.mlp.shared_experts.down_proj", - "model.layers.80.mlp.shared_experts.gate_proj", - "model.layers.80.mlp.shared_experts.up_proj", - "model.layers.80.mlp.shared_experts.down_proj", - "model.layers.81.mlp.shared_experts.gate_proj", - "model.layers.81.mlp.shared_experts.up_proj", - "model.layers.81.mlp.shared_experts.down_proj", - "model.layers.82.mlp.shared_experts.gate_proj", - "model.layers.82.mlp.shared_experts.up_proj", - "model.layers.82.mlp.shared_experts.down_proj", - "model.layers.83.mlp.shared_experts.gate_proj", - "model.layers.83.mlp.shared_experts.up_proj", - "model.layers.83.mlp.shared_experts.down_proj", - "model.layers.84.mlp.shared_experts.gate_proj", - "model.layers.84.mlp.shared_experts.up_proj", - "model.layers.84.mlp.shared_experts.down_proj", - "model.layers.85.mlp.shared_experts.gate_proj", - "model.layers.85.mlp.shared_experts.up_proj", - "model.layers.85.mlp.shared_experts.down_proj", - 
"model.layers.86.mlp.shared_experts.gate_proj", - "model.layers.86.mlp.shared_experts.up_proj", - "model.layers.86.mlp.shared_experts.down_proj", - "model.layers.87.mlp.shared_experts.gate_proj", - "model.layers.87.mlp.shared_experts.up_proj", - "model.layers.87.mlp.shared_experts.down_proj", - "model.layers.88.mlp.shared_experts.gate_proj", - "model.layers.88.mlp.shared_experts.up_proj", - "model.layers.88.mlp.shared_experts.down_proj", - "model.layers.89.mlp.shared_experts.gate_proj", - "model.layers.89.mlp.shared_experts.up_proj", - "model.layers.89.mlp.shared_experts.down_proj", - "model.layers.90.mlp.shared_experts.gate_proj", - "model.layers.90.mlp.shared_experts.up_proj", - "model.layers.90.mlp.shared_experts.down_proj", - "model.layers.91.mlp.shared_experts.gate_proj", - "model.layers.91.mlp.shared_experts.up_proj", - "model.layers.91.mlp.shared_experts.down_proj", + "re:.*model.layers.([3-9]|[1-8][0-9]|9[01]).mlp.shared_experts.(gate|up|down)_proj", # Ignore the output head "lm_head", From 5429a71a3092e55ed156f6e212af96374a5edf53 Mon Sep 17 00:00:00 2001 From: phaelon74 <33295008+phaelon74@users.noreply.github.com> Date: Tue, 6 Jan 2026 11:53:42 -0600 Subject: [PATCH 08/17] Update the Test_Calib_glm4_moe.py with proper stub directory. 
Signed-off-by: phaelon74 <33295008+phaelon74@users.noreply.github.com> --- tests/llmcompressor/modeling/test_calib_glm4_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/llmcompressor/modeling/test_calib_glm4_moe.py b/tests/llmcompressor/modeling/test_calib_glm4_moe.py index 4f4449b3c2..992f0c53c6 100644 --- a/tests/llmcompressor/modeling/test_calib_glm4_moe.py +++ b/tests/llmcompressor/modeling/test_calib_glm4_moe.py @@ -22,7 +22,7 @@ @requires_cadence("weekly") -@pytest.mark.parametrize("model_stub", ["THUDM/glm-4-9b-chat"]) # Update with actual GLM4 MoE model stub +@pytest.mark.parametrize("model_stub", ["zai-org/GLM-4.7"]) # Update with actual GLM4 MoE model stub def test_calib_replace_glm4moe_all_experts(model_stub): with skip_weights_download(): model = AutoModelForCausalLM.from_pretrained(model_stub, trust_remote_code=True) From 8caaf284c657b4965a12d5293962a688dc72b59a Mon Sep 17 00:00:00 2001 From: phaelon74 <33295008+phaelon74@users.noreply.github.com> Date: Tue, 6 Jan 2026 12:07:00 -0600 Subject: [PATCH 09/17] Updating the Example script to use argument parameters at script launch instead of a .env file. Signed-off-by: phaelon74 <33295008+phaelon74@users.noreply.github.com> --- examples/quantizing_moe/glm4.7_example.py | 41 +++++++++++------------ 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/examples/quantizing_moe/glm4.7_example.py b/examples/quantizing_moe/glm4.7_example.py index 35dfca4a4b..45fa903130 100644 --- a/examples/quantizing_moe/glm4.7_example.py +++ b/examples/quantizing_moe/glm4.7_example.py @@ -1,5 +1,4 @@ -import os -from pathlib import Path +import argparse from datasets import load_dataset, concatenate_datasets from transformers import AutoModelForCausalLM, AutoTokenizer @@ -11,35 +10,33 @@ # This script does W4A16 AWQ quantization of the GLM-4.7 model.
It uses Group Size of 32 and two datasets (one specific for quantization and one for reasoning models) # Running this script on an RTX PRO 6000 Workstation cards sees up to 40GB of VRAM used and roughly ~3.5 hours of run time. # This model script uses the glm4 modeling file to make sure that for each calibration sample, all experts are engaged. -# This script also uses a local .ENV file, for Source and Destination. Change as needed. +# This script accepts command-line arguments for source and destination directories. # GLM 4.7 has Dense layers for the first three layers, so we skip multiple sections of those layers. We then need to add all of that to a mapping, to apply it during quantization. # ========================= -# Load ENV Variables +# Parse Command-Line Arguments # ========================= -from dotenv import load_dotenv - -# Load the .env that sits next to this script (works regardless of where you run it) -# The .env file should be in the directory this script is run from and should look like the following: -# SRC_DIR=/media/fmodels/zai-org/GLM-4.7/ -# DST_DIR=/media/fmodels/TheHouseOfTheDude/GLM-4.7_Compressed-Tensors/W4A16_GS32 -# Those two lines are all that's needed. -load_dotenv(Path(__file__).with_name(".env")) - -def require_env(key: str) -> str: - val = os.getenv(key) - if not val or not val.strip(): - raise RuntimeError(f"Missing environment variable: {key}") - return val.strip() +parser = argparse.ArgumentParser(description="Run W4A16 AWQ quantization on GLM-4.7 model.") +parser.add_argument( + "model_path", + type=str, + help="Path to the source model directory." +) +parser.add_argument( + "output_path", + type=str, + help="Path to the destination directory for saving quantized model." 
+) -SRC_DIR = require_env("SRC_DIR") -DST_DIR = require_env("DST_DIR") +args = parser.parse_args() +model_path = args.model_path +output_path = args.output_path # ========================= # Model (GLM / GLM-MoE) # ========================= -MODEL_ID = require_env("SRC_DIR") +MODEL_ID = model_path model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) @@ -196,7 +193,7 @@ def preprocess_rombo(batch): # ========================= # Quantize + save (writes quantization_config for vLLM) # ========================= -SAVE_DIR = require_env("DST_DIR") +SAVE_DIR = output_path oneshot( model=model, From 9d3618beb0ea79a683074789c27f5d4f9e0045aa Mon Sep 17 00:00:00 2001 From: phaelon74 <33295008+phaelon74@users.noreply.github.com> Date: Tue, 6 Jan 2026 15:06:28 -0600 Subject: [PATCH 10/17] Address the items identified during quality check. Signed-off-by: phaelon74 <33295008+phaelon74@users.noreply.github.com> --- .../{glm4.7_example.py => glm4_7_example.py} | 52 +++++++++++++------ src/llmcompressor/modeling/glm4_moe.py | 8 ++- .../modeling/test_calib_glm4_moe.py | 2 +- 3 files changed, 39 insertions(+), 23 deletions(-) rename examples/quantizing_moe/{glm4.7_example.py => glm4_7_example.py} (86%) diff --git a/examples/quantizing_moe/glm4.7_example.py b/examples/quantizing_moe/glm4_7_example.py similarity index 86% rename from examples/quantizing_moe/glm4.7_example.py rename to examples/quantizing_moe/glm4_7_example.py index 45fa903130..fb82039334 100644 --- a/examples/quantizing_moe/glm4.7_example.py +++ b/examples/quantizing_moe/glm4_7_example.py @@ -1,23 +1,32 @@ import argparse -from datasets import load_dataset, concatenate_datasets +from datasets import concatenate_datasets, load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.modifiers.awq import AWQModifier, AWQMapping -from 
llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE - -# This script does W4A16 AWQ quantization of the GLM-4.7 model. It uses Group Size of 32 and two datasets (one specific for quantization and one for reasoning models) -# Running this script on an RTX PRO 6000 Workstation cards sees up to 40GB of VRAM used and roughly ~3.5 hours of run time. -# This model script uses the glm4 modeling file to make sure that for each calibration sample, all experts are engaged. -# This script accepts command-line arguments for source and destination directories. -# GLM 4.7 has Dense layers for the first three layers, so we skip multiple sections of those layers. We then need to add all of that to a mapping, to apply it during quantization. +from llmcompressor.modifiers.awq import AWQMapping, AWQModifier +from llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE # noqa: F401 + +# This script does W4A16 AWQ quantization of the GLM-4.7 model. +# It uses Group Size of 32 and two datasets (one specific for quantization +# and one for reasoning models). +# Running this script on an RTX PRO 6000 Workstation cards sees up to 40GB +# of VRAM used and roughly ~3.5 hours of run time. +# This model script uses the glm4 modeling file to make sure that for each +# calibration sample, all experts are engaged. +# This script accepts command-line arguments for source and destination +# directories. +# GLM 4.7 has Dense layers for the first three layers, so we skip multiple +# sections of those layers. We then need to add all of that to a mapping, +# to apply it during quantization. # ========================= # Parse Command-Line Arguments # ========================= -parser = argparse.ArgumentParser(description="Run W4A16 AWQ quantization on GLM-4.7 model.") +parser = argparse.ArgumentParser( + description="Run W4A16 AWQ quantization on GLM-4.7 model." 
+) parser.add_argument( "model_path", type=str, @@ -50,7 +59,10 @@ NUM_NEURALMAGIC = int(NUM_CALIBRATION_SAMPLES * 0.6) # ~307 samples NUM_ROMBO = NUM_CALIBRATION_SAMPLES - NUM_NEURALMAGIC # ~205 samples -print(f"Loading calibration datasets: {NUM_NEURALMAGIC} from Neural Magic, {NUM_ROMBO} from Rombo") +print( + f"Loading calibration datasets: {NUM_NEURALMAGIC} from Neural Magic, " + f"{NUM_ROMBO} from Rombo" +) # Load Neural Magic dataset neuralmagic_dataset_id = "neuralmagic/LLM_compression_calibration" @@ -90,23 +102,25 @@ def preprocess_neuralmagic(batch): # Format: {"instruction": "", "input": [""], "output": [""]} def preprocess_rombo(batch): rendered = [] - for instruction, inputs, outputs in zip(batch["instruction"], batch["input"], batch["output"]): + for instruction, inputs, outputs in zip( + batch["instruction"], batch["input"], batch["output"] + ): # Construct text from instruction, input, and output # Combine instruction with all input/output pairs text_parts = [instruction] - + # Handle input array (may contain multiple items) if isinstance(inputs, list) and len(inputs) > 0: for inp in inputs: if inp and inp.strip(): text_parts.append(f"\n\nInput: {inp}") - + # Handle output array (may contain multiple items) if isinstance(outputs, list) and len(outputs) > 0: for out in outputs: if out and out.strip(): text_parts.append(f"\n\nOutput: {out}") - + # Join all parts text = "".join(text_parts) rendered.append(text) @@ -206,8 +220,12 @@ def preprocess_rombo(batch): # Fix generation config validation issue before saving if hasattr(model, 'generation_config') and model.generation_config is not None: - # If temperature is set but do_sample is False, either enable do_sample or remove temperature - if hasattr(model.generation_config, 'temperature') and model.generation_config.temperature is not None: + # If temperature is set but do_sample is False, either enable do_sample + # or remove temperature + if ( + hasattr(model.generation_config, 'temperature') + and 
model.generation_config.temperature is not None + ): if not getattr(model.generation_config, 'do_sample', False): # Set do_sample=True to make temperature valid, or remove temperature model.generation_config.do_sample = True diff --git a/src/llmcompressor/modeling/glm4_moe.py b/src/llmcompressor/modeling/glm4_moe.py index c33dc878b3..b588534934 100644 --- a/src/llmcompressor/modeling/glm4_moe.py +++ b/src/llmcompressor/modeling/glm4_moe.py @@ -11,7 +11,6 @@ class CalibrationGlm4MoeMoE(MoECalibrationModule): """ Calibration version of Glm4MoeMoE that sends all tokens to all experts. - During calibration, when calibrate_all_experts=True, all tokens are sent to all experts to ensure proper quantization statistics are collected for every expert, not just those activated by the calibration data routing. @@ -35,12 +34,10 @@ def __init__( def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: """ Forward pass with optional calibration mode. - When calibrate_all_experts=True: - All tokens are sent to all experts for calibration - Routing weights are still used for final output combination - This ensures all experts see calibration data - When calibrate_all_experts=False: - Normal MoE routing behavior (only routed tokens go to each expert) """ @@ -66,7 +63,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # The output is still calculated using only the routed tokens. expert_output_full = expert(hidden_states) if not has_tokens: - continue # No tokens routed to this expert, but stats were gathered. + # No tokens routed to this expert, but stats were gathered. + continue expert_output = expert_output_full[token_indices] else: # Standard MoE behavior: only process tokens routed to this expert. @@ -87,7 +85,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: def restore(self, original: torch.nn.Module) -> torch.nn.Module: """ Restore the original module structure. 
- + Since is_permanent=False, this method is called when exiting the calibration context to restore the original MoE module. """ diff --git a/tests/llmcompressor/modeling/test_calib_glm4_moe.py b/tests/llmcompressor/modeling/test_calib_glm4_moe.py index 992f0c53c6..91858ca6c4 100644 --- a/tests/llmcompressor/modeling/test_calib_glm4_moe.py +++ b/tests/llmcompressor/modeling/test_calib_glm4_moe.py @@ -22,7 +22,7 @@ @requires_cadence("weekly") -@pytest.mark.parametrize("model_stub", ["zai-org/GLM-4.7"]) # Update with actual GLM4 MoE model stub +@pytest.mark.parametrize("model_stub", ["zai-org/GLM-4.7"]) def test_calib_replace_glm4moe_all_experts(model_stub): with skip_weights_download(): model = AutoModelForCausalLM.from_pretrained(model_stub, trust_remote_code=True) From 3ffa14d6e5a64d0fd4cdcf08cad3af64bd5eae1e Mon Sep 17 00:00:00 2001 From: phaelon74 <33295008+phaelon74@users.noreply.github.com> Date: Wed, 7 Jan 2026 14:05:08 -0600 Subject: [PATCH 11/17] Updating the order of datasets import. Signed-off-by: phaelon74 <33295008+phaelon74@users.noreply.github.com> --- examples/quantizing_moe/glm4_7_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/quantizing_moe/glm4_7_example.py b/examples/quantizing_moe/glm4_7_example.py index fb82039334..3726b7f989 100644 --- a/examples/quantizing_moe/glm4_7_example.py +++ b/examples/quantizing_moe/glm4_7_example.py @@ -1,6 +1,6 @@ import argparse -from datasets import concatenate_datasets, load_dataset +from datasets import load_dataset, concatenate_datasets from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot From e2e1b77ada47a26383e4574cbedf7b84a861a764 Mon Sep 17 00:00:00 2001 From: phaelon74 <33295008+phaelon74@users.noreply.github.com> Date: Tue, 13 Jan 2026 11:30:31 -0600 Subject: [PATCH 12/17] Removed AWQMappings and utilized AWQModifier. Also updated torch_dtype to just dtype.
Signed-off-by: phaelon74 <33295008+phaelon74@users.noreply.github.com> --- examples/quantizing_moe/glm4_7_example.py | 26 ++++++----------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/examples/quantizing_moe/glm4_7_example.py b/examples/quantizing_moe/glm4_7_example.py index 3726b7f989..bf7a512b8e 100644 --- a/examples/quantizing_moe/glm4_7_example.py +++ b/examples/quantizing_moe/glm4_7_example.py @@ -4,7 +4,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.modifiers.awq import AWQMapping, AWQModifier +from llmcompressor.modifiers.awq import AWQModifier from llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE # noqa: F401 # This script does W4A16 AWQ quantization of the GLM-4.7 model. @@ -46,7 +46,7 @@ # Model (GLM / GLM-MoE) # ========================= MODEL_ID = model_path -model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) # ========================= @@ -159,9 +159,10 @@ def preprocess_rombo(batch): # ========================= moe_ignores = [ - # Layers 0-2: Dense layer - ignore attention and MLP - "model.layers.[0-2].self_attn.(q|k|v|o)_proj", - "model.layers.[0-2].mlp.(gate|up|down)_proj", + # Layers 0-2: Dense layer - ignore entire layers + "model.layers.0.*", + "model.layers.1.*", + "model.layers.2.*", # Layers 3-91: MoE layers - ignore shared_experts "re:.*model.layers.([3-9]|[1-8][0-9]|9[01]).mlp.shared_experts.(gate|up|down)_proj", @@ -170,24 +171,9 @@ def preprocess_rombo(batch): "lm_head", ] -# Create explicit mappings that skip layers 0-2 -mappings = [] -for layer_idx in range(3, 92): # Skip dense layers 0-2 - mappings.append( - AWQMapping( - smooth_layer=f"model.layers.{layer_idx}.input_layernorm", - balance_layers=[ - f"model.layers.{layer_idx}.self_attn.q_proj", - 
f"model.layers.{layer_idx}.self_attn.k_proj", - f"model.layers.{layer_idx}.self_attn.v_proj", - ] - ) - ) - recipe = [ AWQModifier( ignore=moe_ignores, - mappings=mappings, # Provide explicit mappings config_groups={ "group_0": { "targets": ["Linear"], From db7de05742d139738711659a4b59d274e1098572 Mon Sep 17 00:00:00 2001 From: phaelon74 <33295008+phaelon74@users.noreply.github.com> Date: Tue, 13 Jan 2026 11:37:47 -0600 Subject: [PATCH 13/17] Created a helper function for fixing the generation config. Signed-off-by: phaelon74 <33295008+phaelon74@users.noreply.github.com> --- examples/quantizing_moe/glm4_7_example.py | 33 ++++++++++++++++------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/examples/quantizing_moe/glm4_7_example.py b/examples/quantizing_moe/glm4_7_example.py index bf7a512b8e..dc4c76b0f2 100644 --- a/examples/quantizing_moe/glm4_7_example.py +++ b/examples/quantizing_moe/glm4_7_example.py @@ -21,6 +21,28 @@ # to apply it during quantization. +def fix_generation_config(model): + """ + Fix generation config validation issue before saving. + + If temperature is set but do_sample is False, enable do_sample=True + to make temperature valid. This prevents validation errors when saving + models with generation configs. 
+ + :param model: The model to fix generation_config for + """ + if hasattr(model, 'generation_config') and model.generation_config is not None: + # If temperature is set but do_sample is False, either enable do_sample + # or remove temperature + if ( + hasattr(model.generation_config, 'temperature') + and model.generation_config.temperature is not None + ): + if not getattr(model.generation_config, 'do_sample', False): + # Set do_sample=True to make temperature valid, or remove temperature + model.generation_config.do_sample = True + + # ========================= # Parse Command-Line Arguments # ========================= @@ -205,16 +227,7 @@ def preprocess_rombo(batch): ) # Fix generation config validation issue before saving -if hasattr(model, 'generation_config') and model.generation_config is not None: - # If temperature is set but do_sample is False, either enable do_sample - # or remove temperature - if ( - hasattr(model.generation_config, 'temperature') - and model.generation_config.temperature is not None - ): - if not getattr(model.generation_config, 'do_sample', False): - # Set do_sample=True to make temperature valid, or remove temperature - model.generation_config.do_sample = True +fix_generation_config(model) # (Optional redundant save) model.save_pretrained(SAVE_DIR, save_compressed=True) From 19be730b7f26adaf29ee2888d89d71f982d14869 Mon Sep 17 00:00:00 2001 From: phaelon74 <33295008+phaelon74@users.noreply.github.com> Date: Tue, 20 Jan 2026 11:01:48 -0500 Subject: [PATCH 14/17] Simplified the GLM4_7 Example script. 
Signed-off-by: phaelon74 <33295008+phaelon74@users.noreply.github.com> --- examples/quantizing_moe/glm4_7_example.py | 258 ++++------------------ 1 file changed, 47 insertions(+), 211 deletions(-) diff --git a/examples/quantizing_moe/glm4_7_example.py b/examples/quantizing_moe/glm4_7_example.py index dc4c76b0f2..820b5d9e6e 100644 --- a/examples/quantizing_moe/glm4_7_example.py +++ b/examples/quantizing_moe/glm4_7_example.py @@ -1,237 +1,73 @@ -import argparse - -from datasets import load_dataset, concatenate_datasets +from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot from llmcompressor.modifiers.awq import AWQModifier -from llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE # noqa: F401 - -# This script does W4A16 AWQ quantization of the GLM-4.7 model. -# It uses Group Size of 32 and two datasets (one specific for quantization -# and one for reasoning models). -# Running this script on an RTX PRO 6000 Workstation cards sees up to 40GB -# of VRAM used and roughly ~3.5 hours of run time. -# This model script uses the glm4 modeling file to make sure that for each -# calibration sample, all experts are engaged. -# This script accepts command-line arguments for source and destination -# directories. -# GLM 4.7 has Dense layers for the first three layers, so we skip multiple -# sections of those layers. We then need to add all of that to a mapping, -# to apply it during quantization. - - -def fix_generation_config(model): - """ - Fix generation config validation issue before saving. - - If temperature is set but do_sample is False, enable do_sample=True - to make temperature valid. This prevents validation errors when saving - models with generation configs. 
- - :param model: The model to fix generation_config for - """ - if hasattr(model, 'generation_config') and model.generation_config is not None: - # If temperature is set but do_sample is False, either enable do_sample - # or remove temperature - if ( - hasattr(model.generation_config, 'temperature') - and model.generation_config.temperature is not None - ): - if not getattr(model.generation_config, 'do_sample', False): - # Set do_sample=True to make temperature valid, or remove temperature - model.generation_config.do_sample = True - - -# ========================= -# Parse Command-Line Arguments -# ========================= -parser = argparse.ArgumentParser( - description="Run W4A16 AWQ quantization on GLM-4.7 model." -) -parser.add_argument( - "model_path", - type=str, - help="Path to the source model directory." -) -parser.add_argument( - "output_path", - type=str, - help="Path to the destination directory for saving quantized model." -) - -args = parser.parse_args() -model_path = args.model_path -output_path = args.output_path - -# ========================= -# Model (GLM / GLM-MoE) -# ========================= -MODEL_ID = model_path -model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) - -# ========================= -# Calibration data (Neural Magic + Rombo Optimized Reasoning) -# ========================= -NUM_CALIBRATION_SAMPLES = 512 +from llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE + +# Load the model +model_id = "zai-org/GLM-4.7" +model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id) +# MoE calibration is now handled automatically by the pipeline. +# The `CalibrationGlm4MoeMoE` modules (from `llmcompressor.modeling.glm4_moe`) +# will be applied during calibration to enable proper expert calibration. 
+# These replace the original `Glm4MoeMoE` class from +# `transformers.models.glm4_moe.modeling_glm4_moe`. + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" + +# Select number of samples. 512 samples is a good place to start. +# Increasing the number of samples can improve accuracy. +NUM_CALIBRATION_SAMPLES = 5 MAX_SEQUENCE_LENGTH = 2048 -# Calculate sample distribution: 60% Neural Magic, 40% Rombo -NUM_NEURALMAGIC = int(NUM_CALIBRATION_SAMPLES * 0.6) # ~307 samples -NUM_ROMBO = NUM_CALIBRATION_SAMPLES - NUM_NEURALMAGIC # ~205 samples +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) -print( - f"Loading calibration datasets: {NUM_NEURALMAGIC} from Neural Magic, " - f"{NUM_ROMBO} from Rombo" -) -# Load Neural Magic dataset -neuralmagic_dataset_id = "neuralmagic/LLM_compression_calibration" -neuralmagic_split = "train" -ds_neuralmagic = load_dataset(neuralmagic_dataset_id, split=neuralmagic_split) - -# Sample from Neural Magic dataset -n_nm = min(NUM_NEURALMAGIC, len(ds_neuralmagic)) -ds_neuralmagic = ds_neuralmagic.shuffle(seed=42).select(range(n_nm)) - -# Render messages to chat-style text (batch) -# The neuralmagic dataset has "messages" field with user/assistant roles -def preprocess_neuralmagic(batch): - rendered = [] - for messages in batch["messages"]: - # Apply chat template to the messages directly - text = tokenizer.apply_chat_template( - messages, +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], tokenize=False, - add_generation_prompt=False, ) - rendered.append(text) - return {"text": rendered} - -ds_neuralmagic = ds_neuralmagic.map(preprocess_neuralmagic, batched=True, num_proc=4) - -# Load Rombo Optimized Reasoning dataset -rombo_dataset_id = "Rombo-Org/Optimized_Reasoning" -rombo_split = "train" -ds_rombo = load_dataset(rombo_dataset_id, 
split=rombo_split) - -# Sample from Rombo dataset -n_rombo = min(NUM_ROMBO, len(ds_rombo)) -ds_rombo = ds_rombo.shuffle(seed=43).select(range(n_rombo)) - -# Preprocess Rombo dataset -# Format: {"instruction": "", "input": [""], "output": [""]} -def preprocess_rombo(batch): - rendered = [] - for instruction, inputs, outputs in zip( - batch["instruction"], batch["input"], batch["output"] - ): - # Construct text from instruction, input, and output - # Combine instruction with all input/output pairs - text_parts = [instruction] - - # Handle input array (may contain multiple items) - if isinstance(inputs, list) and len(inputs) > 0: - for inp in inputs: - if inp and inp.strip(): - text_parts.append(f"\n\nInput: {inp}") - - # Handle output array (may contain multiple items) - if isinstance(outputs, list) and len(outputs) > 0: - for out in outputs: - if out and out.strip(): - text_parts.append(f"\n\nOutput: {out}") - - # Join all parts - text = "".join(text_parts) - rendered.append(text) - return {"text": rendered} - -ds_rombo = ds_rombo.map(preprocess_rombo, batched=True, num_proc=4) - -# Combine both datasets -ds = concatenate_datasets([ds_neuralmagic, ds_rombo]) - -# Shuffle the combined dataset -ds = ds.shuffle(seed=44) - -# Tokenize in batches -ds = ds.map( - lambda batch: tokenizer( - batch["text"], + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. 
+def tokenize(sample): + return tokenizer( + sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False, - ), - batched=True, - remove_columns=ds.column_names, - num_proc=4, -) + ) + -print(f"Combined calibration dataset: {len(ds)} samples") - -# ========================= -# AWQ recipe with config_groups -# - Weight-only INT4 (W4A16 **symmetric**) -# - group_size: 32 -# - IMPORTANT: do NOT ignore mlp.gate / gate_up_proj (merged layer) -# - Keep router and output head unquantized -# ========================= - -moe_ignores = [ - # Layers 0-2: Dense layer - ignore entire layers - "model.layers.0.*", - "model.layers.1.*", - "model.layers.2.*", - - # Layers 3-91: MoE layers - ignore shared_experts - "re:.*model.layers.([3-9]|[1-8][0-9]|9[01]).mlp.shared_experts.(gate|up|down)_proj", - - # Ignore the output head - "lm_head", -] - -recipe = [ - AWQModifier( - ignore=moe_ignores, - config_groups={ - "group_0": { - "targets": ["Linear"], - "weights": { - "num_bits": 4, - "type": "int", - "symmetric": True, # W4A16 (symmetric) - "strategy": "group", - "group_size": 32, - "dynamic": False, - }, - }, - }, - ), -] - -# ========================= -# Quantize + save (writes quantization_config for vLLM) -# ========================= -SAVE_DIR = output_path +ds = ds.map(tokenize, remove_columns=ds.column_names) +# Configure the quantization algorithm to run. +# * quantize the weights to 4 bit with GPTQ with a group size 128 +recipe = AWQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]) + +# Apply algorithms. oneshot( model=model, dataset=ds, recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, -# output_dir=SAVE_DIR, ) -# Fix generation config validation issue before saving -fix_generation_config(model) - -# (Optional redundant save) +# Save to disk compressed. 
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) - -print("Saved to:", SAVE_DIR) - From 014cfea833b929c31d47d27db5c5a85557d1b0e9 Mon Sep 17 00:00:00 2001 From: phaelon74 <33295008+phaelon74@users.noreply.github.com> Date: Tue, 20 Jan 2026 11:11:11 -0500 Subject: [PATCH 15/17] Adding MOE_ignore layers back into Simplified Script. Signed-off-by: phaelon74 <33295008+phaelon74@users.noreply.github.com> --- examples/quantizing_moe/glm4_7_example.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/examples/quantizing_moe/glm4_7_example.py b/examples/quantizing_moe/glm4_7_example.py index 820b5d9e6e..0b4bb212ec 100644 --- a/examples/quantizing_moe/glm4_7_example.py +++ b/examples/quantizing_moe/glm4_7_example.py @@ -21,7 +21,7 @@ # Select number of samples. 512 samples is a good place to start. # Increasing the number of samples can improve accuracy. -NUM_CALIBRATION_SAMPLES = 5 +NUM_CALIBRATION_SAMPLES = 512 MAX_SEQUENCE_LENGTH = 2048 # Load dataset and preprocess. @@ -54,9 +54,19 @@ def tokenize(sample): ds = ds.map(tokenize, remove_columns=ds.column_names) +moe_ignores = [ + # Layers 0-2: Dense layer - ignore entire layers + "model.layers.0.*", + "model.layers.1.*", + "model.layers.2.*", + + # Ignore the output head + "lm_head", +] + # Configure the quantization algorithm to run. # * quantize the weights to 4 bit with GPTQ with a group size 128 -recipe = AWQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]) +recipe = AWQModifier(targets="Linear", scheme="W4A16", ignore=moe_ignores) # Apply algorithms. oneshot( From 4f0bb663cce0f6f91089512032a48fa178599ac6 Mon Sep 17 00:00:00 2001 From: phaelon74 <33295008+phaelon74@users.noreply.github.com> Date: Tue, 20 Jan 2026 12:27:31 -0500 Subject: [PATCH 16/17] Fixing Import Order. 
Signed-off-by: phaelon74 <33295008+phaelon74@users.noreply.github.com> --- examples/quantizing_moe/glm4_7_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/quantizing_moe/glm4_7_example.py b/examples/quantizing_moe/glm4_7_example.py index 0b4bb212ec..247d992e36 100644 --- a/examples/quantizing_moe/glm4_7_example.py +++ b/examples/quantizing_moe/glm4_7_example.py @@ -3,7 +3,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.awq import AWQModifier -from llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE +from llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE # noqa: F401 # Load the model model_id = "zai-org/GLM-4.7" From 083c35bd91b4b3022f091dea14a9c290217f903d Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 20 Jan 2026 13:43:10 -0500 Subject: [PATCH 17/17] Format --- examples/quantizing_moe/glm4_7_example.py | 3 +-- src/llmcompressor/modeling/glm4_moe.py | 1 - tests/llmcompressor/modeling/test_calib_glm4_moe.py | 1 - 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/quantizing_moe/glm4_7_example.py b/examples/quantizing_moe/glm4_7_example.py index 247d992e36..c3ed6ef8c1 100644 --- a/examples/quantizing_moe/glm4_7_example.py +++ b/examples/quantizing_moe/glm4_7_example.py @@ -2,8 +2,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.modifiers.awq import AWQModifier from llmcompressor.modeling.glm4_moe import CalibrationGlm4MoeMoE # noqa: F401 +from llmcompressor.modifiers.awq import AWQModifier # Load the model model_id = "zai-org/GLM-4.7" @@ -59,7 +59,6 @@ def tokenize(sample): "model.layers.0.*", "model.layers.1.*", "model.layers.2.*", - # Ignore the output head "lm_head", ] diff --git a/src/llmcompressor/modeling/glm4_moe.py b/src/llmcompressor/modeling/glm4_moe.py index b588534934..4f4e470d50 100644 --- a/src/llmcompressor/modeling/glm4_moe.py +++ b/src/llmcompressor/modeling/glm4_moe.py @@ -90,4 
+90,3 @@ def restore(self, original: torch.nn.Module) -> torch.nn.Module: the calibration context to restore the original MoE module. """ return original - diff --git a/tests/llmcompressor/modeling/test_calib_glm4_moe.py b/tests/llmcompressor/modeling/test_calib_glm4_moe.py index 91858ca6c4..8ca1c92cb0 100644 --- a/tests/llmcompressor/modeling/test_calib_glm4_moe.py +++ b/tests/llmcompressor/modeling/test_calib_glm4_moe.py @@ -89,4 +89,3 @@ def test_calib_glm4moe_module(): with calibration_forward_context(module): output = module(sample) assert torch.allclose(true_output, output, atol=1e-6) -