diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index ae93d2353a3..c77727e216c 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -608,6 +608,7 @@ steps:
         timeout_in_minutes: 60
         commands:
           - pytest -sv tests/e2e/ -k "not test_wan and not test_bagel_expansion and not hunyuan" -m "full_model and diffusion and L4" --run-level "full_model" --ignore=tests/e2e/accuracy
+          - pytest -sv tests/e2e/offline_inference/test_glm_image_autoround_w4a16_expansion.py -m "full_model and diffusion and L4" --run-level "full_model"
         agents:
           queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
         plugins:
diff --git a/docs/user_guide/quantization/autoround.md b/docs/user_guide/quantization/autoround.md
index 88fed3b62b3..3afc8b06233 100644
--- a/docs/user_guide/quantization/autoround.md
+++ b/docs/user_guide/quantization/autoround.md
@@ -52,8 +52,8 @@ vLLM-Omni's runtime module names.
 
 | Model | Scope | Status | Notes |
 |-------|-------|--------|-------|
+| GLM-Image | Diffusion transformer | ✅ | `Intel/GLM-Image-int4-AutoRound` |
 | BAGEL | Checkpoint-defined diffusion or transformer stage | Not validated | Requires a compatible AutoRound checkpoint |
-| GLM-Image | Checkpoint-defined diffusion or transformer stage | Not validated | Requires a compatible AutoRound checkpoint |
 
 ## Configuration
 
diff --git a/tests/diffusion/models/glm_image/test_glm_image_quantization.py b/tests/diffusion/models/glm_image/test_glm_image_quantization.py
new file mode 100644
index 00000000000..853150af78f
--- /dev/null
+++ b/tests/diffusion/models/glm_image/test_glm_image_quantization.py
@@ -0,0 +1,699 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for GLM-Image quantization support (W4A16/AutoRound).
+
+These tests verify that the GLM-Image DiT transformer and its submodules accept a
+quantization config (as used for W4A16/AutoRound) and pass it to their inner layers.
+"""
+
+import pytest
+import torch
+from pytest_mock import MockerFixture
+
+from vllm_omni.diffusion.data import DiffusionParallelConfig
+from vllm_omni.diffusion.models.glm_image.glm_image_transformer import (
+    ColumnParallelGELU,
+    ColumnParallelSiLU,
+    GlmImageAdaLayerNormContinuous,
+    GlmImageAdaLayerNormZero,
+    GlmImageAttention,
+    GlmImageFeedForward,
+    GlmImageImageProjector,
+    GlmImagePrepare,
+    GlmImageRotaryPosEmbed,
+    GlmImageTransformer2DModel,
+    GlmImageTransformerBlock,
+    _positive_divisors,
+    validate_glm_image_tp_constraints,
+)
+from vllm_omni.model_executor.models.glm_image.pipeline import GLM_IMAGE_PIPELINE
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
+
+@pytest.fixture(scope="function", autouse=True)
+def setup_mocks(mocker: MockerFixture):
+    """Set up common mocks for all tests."""
+    mocker.patch(
+        "vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size",
+        return_value=1,
+    )
+    mock_get_tp_group = mocker.patch("vllm.distributed.parallel_state.get_tp_group")
+    mock_tp_group = mocker.MagicMock()
+    mock_tp_group.world_size = 1
+    mock_get_tp_group.return_value = mock_tp_group
+    yield
+
+
+class TestPositiveDivisors:
+    """Test _positive_divisors helper function."""
+
+    def test_divisors_of_1(self):
+        assert _positive_divisors(1) == {1}
+
+    def test_divisors_of_12(self):
+        assert _positive_divisors(12) == {1, 2, 3, 4, 6, 12}
+
+    def test_divisors_of_prime(self):
+        assert _positive_divisors(7) == {1, 7}
+
+    def test_divisors_of_0_returns_empty(self):
+        assert _positive_divisors(0) == set()
+
+    def test_divisors_of_negative_returns_empty(self):
+        assert _positive_divisors(-5) == set()
+
+
+class TestValidateGlmImageTpConstraints:
+    """Test TP constraint validation for GLM-Image."""
+
+    def test_valid_tp_size_1(self):
+        """TP=1 should always be valid."""
+        result = validate_glm_image_tp_constraints(
+            dim=2560,
+            num_heads=64,
+            ffn_hidden_dim=10240,
+            tensor_parallel_size=1,
+        )
+        assert 1 in result
+
+    def test_valid_tp_size_2_for_divisible_dim(self):
+        """TP=2 is valid when all dims are divisible by 2."""
+        result = validate_glm_image_tp_constraints(
+            dim=2560,
+            num_heads=64,
+            ffn_hidden_dim=10240,
+            tensor_parallel_size=2,
+        )
+        assert 1 in result
+        assert 2 in result
+
+    def test_valid_tp_size_4_for_divisible_dim(self):
+        """TP=4 is valid when all dims are divisible by 4."""
+        result = validate_glm_image_tp_constraints(
+            dim=2560,
+            num_heads=64,
+            ffn_hidden_dim=10240,
+            tensor_parallel_size=4,
+        )
+        assert 1 in result
+        assert 2 in result
+        assert 4 in result
+
+    def test_invalid_tp_size_3_for_divisible_dim(self):
+        """TP=3 is invalid when dim is not divisible by 3."""
+        with pytest.raises(ValueError, match="dim % tensor_parallel_size == 0"):
+            validate_glm_image_tp_constraints(
+                dim=2560,  # 2560 % 3 != 0
+                num_heads=64,
+                ffn_hidden_dim=10240,
+                tensor_parallel_size=3,
+            )
+
+    def test_invalid_tp_size_zero(self):
+        """TP=0 should raise error."""
+        with pytest.raises(ValueError, match="tensor_parallel_size must be > 0"):
+            validate_glm_image_tp_constraints(
+                dim=2560,
+                num_heads=64,
+                ffn_hidden_dim=10240,
+                tensor_parallel_size=0,
+            )
+
+    def test_invalid_tp_size_negative(self):
+        """Negative TP size should raise error."""
+        with pytest.raises(ValueError, match="tensor_parallel_size must be > 0"):
+            validate_glm_image_tp_constraints(
+                dim=2560,
+                num_heads=64,
+                ffn_hidden_dim=10240,
+                tensor_parallel_size=-1,
+            )
+
+
+class TestGlmImageAdaLayerNormZeroQuantization:
+    """Test GlmImageAdaLayerNormZero with quantization config."""
+
+    def test_accepts_quant_config_parameter(self, mocker: MockerFixture):
+        """Verify the class accepts quant_config parameter."""
+        mock_quant_config = mocker.MagicMock()
+        layer = GlmImageAdaLayerNormZero(
+            embedding_dim=512,
+            dim=2560,
+            quant_config=mock_quant_config,
+            prefix="test.norm1",
+        )
+        assert layer.linear.quant_config is mock_quant_config
+
+    def test_accepts_none_quant_config(self):
+        """Verify quant_config=None is accepted."""
+        layer = GlmImageAdaLayerNormZero(
+            embedding_dim=512,
+            dim=2560,
+            quant_config=None,
+        )
+        assert layer.linear.quant_config is None
+
+    def test_forward_handles_tuple_return_from_linear(self):
+        """Verify forward handles tuple returns from ReplicatedLinear."""
+        layer = GlmImageAdaLayerNormZero(
+            embedding_dim=512,
+            dim=2560,
+            quant_config=None,
+        )
+
+        batch_size = 2
+        seq_len = 10
+        hidden_states = torch.randn(batch_size, seq_len, 2560)
+        encoder_hidden_states = torch.randn(batch_size, seq_len, 2560)
+        temb = torch.randn(batch_size, 512)
+
+        # This should work regardless of whether linear returns tuple or tensor
+        result = layer(hidden_states, encoder_hidden_states, temb)
+        assert len(result) == 10  # Should return 10 chunks
+
+
+class TestGlmImageAdaLayerNormContinuousQuantization:
+    """Test GlmImageAdaLayerNormContinuous with quantization config."""
+
+    def test_accepts_quant_config_parameter(self, mocker: MockerFixture):
+        """Verify the class accepts quant_config parameter."""
+        mock_quant_config = mocker.MagicMock()
+        layer = GlmImageAdaLayerNormContinuous(
+            embedding_dim=2560,
+            conditioning_embedding_dim=512,
+            quant_config=mock_quant_config,
+            prefix="test.norm_out",
+        )
+        assert layer.linear.quant_config is mock_quant_config
+
+    def test_accepts_none_quant_config(self):
+        """Verify quant_config=None is accepted."""
+        layer = GlmImageAdaLayerNormContinuous(
+            embedding_dim=2560,
+            conditioning_embedding_dim=512,
+            quant_config=None,
+        )
+        assert layer.linear.quant_config is None
+
+    def test_forward_handles_tuple_return_from_linear(self):
+        """Verify forward handles tuple returns from ReplicatedLinear."""
+        layer = GlmImageAdaLayerNormContinuous(
+            embedding_dim=2560,
+            conditioning_embedding_dim=512,
+            quant_config=None,
+        )
+
+        batch_size = 2
+        seq_len = 10
+        x = torch.randn(batch_size, seq_len, 2560)
+        conditioning_embedding = torch.randn(batch_size, 512)
+
+        result = layer(x, conditioning_embedding)
+        assert result.shape == (batch_size, seq_len, 2560)
+
+
+class TestGlmImageAttentionQuantization:
+    """Test GlmImageAttention with quantization config."""
+
+    def test_accepts_quant_config_parameter(self, mocker: MockerFixture):
+        """Verify GlmImageAttention accepts quant_config parameter."""
+        mock_quant_config = mocker.MagicMock()
+        attn = GlmImageAttention(
+            dim=2560,
+            num_heads=64,
+            head_dim=40,
+            quant_config=mock_quant_config,
+            prefix="test.attn1",
+        )
+        assert attn.to_qkv.quant_config is mock_quant_config
+
+    def test_accepts_none_quant_config(self):
+        """Verify quant_config=None is accepted."""
+        attn = GlmImageAttention(
+            dim=2560,
+            num_heads=64,
+            head_dim=40,
+            quant_config=None,
+        )
+        assert attn.to_qkv.quant_config is None
+
+
+class TestColumnParallelModulesQuantization:
+    """Test ColumnParallelGELU and ColumnParallelSiLU with quantization config."""
+
+    def test_column_parallel_gelu_accepts_quant_config(self, mocker: MockerFixture):
+        """Verify ColumnParallelGELU accepts quant_config."""
+        mock_quant_config = mocker.MagicMock()
+        layer = ColumnParallelGELU(
+            dim_in=2560,
+            dim_out=10240,
+            quant_config=mock_quant_config,
+            prefix="test.gelu",
+        )
+        assert layer.proj.quant_config is mock_quant_config
+
+    def test_column_parallel_silu_accepts_quant_config(self, mocker: MockerFixture):
+        """Verify ColumnParallelSiLU accepts quant_config."""
+        mock_quant_config = mocker.MagicMock()
+        layer = ColumnParallelSiLU(
+            dim_in=2560,
+            dim_out=10240,
+            quant_config=mock_quant_config,
+            prefix="test.silu",
+        )
+        assert layer.proj.quant_config is mock_quant_config
+
+
+class TestGlmImageFeedForwardQuantization:
+    """Test GlmImageFeedForward with quantization config."""
+
+    def test_accepts_quant_config_parameter(self, mocker: MockerFixture):
+        """Verify GlmImageFeedForward accepts quant_config parameter."""
+        mock_quant_config = mocker.MagicMock()
+        ff = GlmImageFeedForward(
+            dim=2560,
+            dim_out=2560,
+            inner_dim=10240,
+            activation_fn="gelu-approximate",
+            quant_config=mock_quant_config,
+            prefix="test.ff",
+        )
+        # Check that the first layer (ColumnParallelGELU) has quant_config
+        gelu_layer = ff.net[0]
+        assert gelu_layer.proj.quant_config is mock_quant_config
+
+    def test_accepts_none_quant_config(self):
+        """Verify quant_config=None is accepted."""
+        ff = GlmImageFeedForward(
+            dim=2560,
+            dim_out=2560,
+            inner_dim=10240,
+            activation_fn="gelu-approximate",
+            quant_config=None,
+        )
+        gelu_layer = ff.net[0]
+        assert gelu_layer.proj.quant_config is None
+
+    def test_linear_silu_activation(self):
+        """Test linear-silu activation function initialization works."""
+        # This test verifies the module can be instantiated with linear-silu activation.
+        # Full forward testing requires proper TP group setup, so we just verify construction.
+        ff = GlmImageFeedForward(
+            dim=2560,
+            dim_out=2560,
+            inner_dim=10240,
+            activation_fn="linear-silu",
+            quant_config=None,
+        )
+        # Verify the FFN has the correct structure
+        assert len(ff.net) == 3  # ColumnParallelSiLU, Identity, RowParallelLinear
+        assert isinstance(ff.net[0], ColumnParallelSiLU)
+
+
+class TestGlmImageTransformerBlockQuantization:
+    """Test GlmImageTransformerBlock with quantization config."""
+
+    def test_accepts_quant_config_parameter(self, mocker: MockerFixture):
+        """Verify GlmImageTransformerBlock accepts quant_config parameter."""
+        mock_quant_config = mocker.MagicMock()
+        parallel_config = DiffusionParallelConfig(
+            tensor_parallel_size=1,
+            sequence_parallel_size=1,
+        )
+        block = GlmImageTransformerBlock(
+            dim=2560,
+            num_attention_heads=64,
+            attention_head_dim=40,
+            time_embed_dim=512,
+            parallel_config=parallel_config,
+            quant_config=mock_quant_config,
+            prefix="test.block",
+        )
+        # Check that inner modules have quant_config
+        assert block.norm1.linear.quant_config is mock_quant_config
+        assert block.attn1.to_qkv.quant_config is mock_quant_config
+        assert block.ff.net[0].proj.quant_config is mock_quant_config
+
+    def test_accepts_none_quant_config(self):
+        """Verify quant_config=None is accepted."""
+        block = GlmImageTransformerBlock(
+            dim=2560,
+            num_attention_heads=64,
+            attention_head_dim=40,
+            time_embed_dim=512,
+            quant_config=None,
+        )
+        assert block.norm1.linear.quant_config is None
+
+
+class TestGlmImagePrepareModule:
+    """Test GlmImagePrepare module."""
+
+    def test_prepare_module_exists(self):
+        """Verify GlmImagePrepare module exists and works."""
+        projector = GlmImageImageProjector(in_channels=16, hidden_size=2560, patch_size=2)
+        rope = GlmImageRotaryPosEmbed(dim=40, patch_size=2)
+        prepare = GlmImagePrepare(
+            image_projector=projector,
+            rope=rope,
+            patch_size=2,
+        )
+
+        # Test forward pass
+        hidden_states = torch.randn(1, 16, 64, 64)  # [B, C, H, W]
+        result = prepare(hidden_states)
+
+        assert len(result) == 5
+        hidden_out, rope_cos, rope_sin, height, width = result
+        assert hidden_out.shape[0] == 1  # batch size
+        assert hidden_out.shape[1] == 1024  # seq_len (32 * 32)
+        assert hidden_out.shape[2] == 2560  # hidden_dim
+
+
+class TestGlmImagePipelineConfig:
+    """Test GLM_IMAGE_PIPELINE configuration."""
+
+    def test_glm_image_pipeline_config_exists(self):
+        """Verify GLM_IMAGE_PIPELINE config exists."""
+        assert GLM_IMAGE_PIPELINE is not None
+
+    def test_pipeline_has_correct_model_type(self):
+        """Verify pipeline has correct model_type."""
+        assert GLM_IMAGE_PIPELINE.model_type == "glm_image"
+
+    def test_pipeline_has_correct_model_arch(self):
+        """Verify pipeline has correct model_arch."""
+        assert GLM_IMAGE_PIPELINE.model_arch == "GlmImageForConditionalGeneration"
+
+    def test_pipeline_has_two_stages(self):
+        """Verify pipeline has two stages (AR + DiT)."""
+        assert len(GLM_IMAGE_PIPELINE.stages) == 2
+
+    def test_stage_0_is_ar_llm(self):
+        """Verify stage 0 is the AR LLM stage."""
+        stage_0 = GLM_IMAGE_PIPELINE.stages[0]
+        assert stage_0.stage_id == 0
+        assert stage_0.model_stage == "ar"
+        assert stage_0.execution_type.value == "llm_ar"
+        assert stage_0.owns_tokenizer is True
+
+    def test_stage_1_is_diffusion(self):
+        """Verify stage 1 is the DiT diffusion stage."""
+        stage_1 = GLM_IMAGE_PIPELINE.stages[1]
+        assert stage_1.stage_id == 1
+        assert stage_1.model_stage == "dit"
+        assert stage_1.execution_type.value == "diffusion"
+        assert stage_1.final_output is True
+        assert stage_1.final_output_type == "image"
+        assert stage_1.input_sources == (0,)  # Takes input from stage 0
+
+    def test_pipeline_has_diffusers_class_name(self):
+        """Verify pipeline has diffusers_class_name."""
+        assert GLM_IMAGE_PIPELINE.diffusers_class_name == "GlmImagePipeline"
+
+
+class TestGlmImageImageProjector:
+    """Test GlmImageImageProjector module."""
+
+    def test_projector_output_shape(self):
+        """Verify projector produces correct output shape."""
+        projector = GlmImageImageProjector(
+            in_channels=16,
+            hidden_size=2560,
+            patch_size=2,
+        )
+
+        # Input: [B, C, H, W] = [1, 16, 64, 64]
+        hidden_states = torch.randn(1, 16, 64, 64)
+        output = projector(hidden_states)
+
+        # After patchify: [B, H/2*W/2, D] = [1, 32*32, 2560] = [1, 1024, 2560]
+        assert output.shape == (1, 1024, 2560)
+
+    def test_projector_with_different_patch_sizes(self):
+        """Verify projector works with different patch sizes."""
+        for patch_size in [2, 4]:
+            projector = GlmImageImageProjector(
+                in_channels=16,
+                hidden_size=2560,
+                patch_size=patch_size,
+            )
+            h, w = 64, 64
+            hidden_states = torch.randn(1, 16, h, w)
+            output = projector(hidden_states)
+            expected_seq_len = (h // patch_size) * (w // patch_size)
+            assert output.shape[1] == expected_seq_len
+
+
+class TestGlmImageRotaryPosEmbed:
+    """Test GlmImageRotaryPosEmbed module."""
+
+    def test_rope_output_shape(self):
+        """Verify RoPE produces correct output shape."""
+        rope = GlmImageRotaryPosEmbed(dim=40, patch_size=2, theta=10000.0)
+        hidden_states = torch.randn(1, 16, 64, 64)  # [B, C, H, W]
+
+        cos, sin = rope(hidden_states)
+
+        # After patchify: height=32, width=32
+        # Output: [32*32, dim] = [1024, 40]
+        assert cos.shape[0] == 1024
+        assert cos.shape[1] == 40
+        assert sin.shape == cos.shape
+
+    def test_rope_value_range(self):
+        """Verify RoPE produces values in valid range."""
+        rope = GlmImageRotaryPosEmbed(dim=40, patch_size=2, theta=10000.0)
+        hidden_states = torch.randn(2, 16, 32, 32)
+
+        cos, sin = rope(hidden_states)
+
+        # cos and sin should be in [-1, 1]
+        assert cos.min() >= -1.0 and cos.max() <= 1.0
+        assert sin.min() >= -1.0 and sin.max() <= 1.0
+
+    def test_rope_consistency(self):
+        """Verify RoPE is consistent for same input."""
+        rope = GlmImageRotaryPosEmbed(dim=40, patch_size=2, theta=10000.0)
+        hidden_states = torch.randn(1, 16, 32, 32)
+
+        cos1, sin1 = rope(hidden_states)
+        cos2, sin2 = rope(hidden_states)
+
+        # Same input should produce same output
+        assert torch.allclose(cos1, cos2)
+        assert torch.allclose(sin1, sin2)
+
+
+class TestGlmImageTransformer2DModelQuantization:
+    """Test GlmImageTransformer2DModel with quantization config."""
+
+    def test_accepts_quant_config_parameter(self, mocker: MockerFixture):
+        """Verify the model accepts quant_config parameter."""
+        from vllm_omni.diffusion.data import OmniDiffusionConfig
+
+        mock_quant_config = mocker.MagicMock()
+        parallel_config = DiffusionParallelConfig(
+            tensor_parallel_size=1,
+            sequence_parallel_size=1,
+        )
+
+        # Create a minimal mock od_config
+        mock_tf_config = mocker.MagicMock()
+        mock_tf_config.patch_size = 2
+        mock_tf_config.in_channels = 16
+        mock_tf_config.out_channels = 16
+        mock_tf_config.num_attention_heads = 64
+        mock_tf_config.attention_head_dim = 40
+        mock_tf_config.time_embed_dim = 512
+        mock_tf_config.condition_dim = 256
+        mock_tf_config.prior_vq_quantizer_codebook_size = 16384
+        mock_tf_config.text_embed_dim = 1024
+        mock_tf_config.num_layers = 2  # Small number for testing
+
+        mock_od_config = mocker.MagicMock(spec=OmniDiffusionConfig)
+        mock_od_config.tf_model_config = mock_tf_config
+        mock_od_config.parallel_config = parallel_config
+
+        model = GlmImageTransformer2DModel(
+            od_config=mock_od_config,
+            quant_config=mock_quant_config,
+        )
+
+        # Check that quantization config was passed to transformer blocks
+        for block in model.transformer_blocks:
+            assert block.norm1.linear.quant_config is mock_quant_config
+            assert block.attn1.to_qkv.quant_config is mock_quant_config
+            assert block.ff.net[0].proj.quant_config is mock_quant_config
+
+    def test_accepts_none_quant_config(self, mocker: MockerFixture):
+        """Verify quant_config=None is accepted."""
+        from vllm_omni.diffusion.data import OmniDiffusionConfig
+
+        parallel_config = DiffusionParallelConfig(
+            tensor_parallel_size=1,
+            sequence_parallel_size=1,
+        )
+
+        mock_tf_config = mocker.MagicMock()
+        mock_tf_config.patch_size = 2
+        mock_tf_config.in_channels = 16
+        mock_tf_config.out_channels = 16
+        mock_tf_config.num_attention_heads = 64
+        mock_tf_config.attention_head_dim = 40
+        mock_tf_config.time_embed_dim = 512
+        mock_tf_config.condition_dim = 256
+        mock_tf_config.prior_vq_quantizer_codebook_size = 16384
+        mock_tf_config.text_embed_dim = 1024
+        mock_tf_config.num_layers = 2
+
+        mock_od_config = mocker.MagicMock(spec=OmniDiffusionConfig)
+        mock_od_config.tf_model_config = mock_tf_config
+        mock_od_config.parallel_config = parallel_config
+
+        model = GlmImageTransformer2DModel(
+            od_config=mock_od_config,
+            quant_config=None,
+        )
+
+        # Check that quantization config is None
+        for block in model.transformer_blocks:
+            assert block.norm1.linear.quant_config is None
+            assert block.attn1.to_qkv.quant_config is None
+
+    def test_norm_out_has_no_quantization(self, mocker: MockerFixture):
+        """Verify norm_out (output layer) does NOT use quantization to preserve precision."""
+        from vllm_omni.diffusion.data import OmniDiffusionConfig
+
+        mock_quant_config = mocker.MagicMock()
+        parallel_config = DiffusionParallelConfig(
+            tensor_parallel_size=1,
+            sequence_parallel_size=1,
+        )
+
+        mock_tf_config = mocker.MagicMock()
+        mock_tf_config.patch_size = 2
+        mock_tf_config.in_channels = 16
+        mock_tf_config.out_channels = 16
+        mock_tf_config.num_attention_heads = 64
+        mock_tf_config.attention_head_dim = 40
+        mock_tf_config.time_embed_dim = 512
+        mock_tf_config.condition_dim = 256
+        mock_tf_config.prior_vq_quantizer_codebook_size = 16384
+        mock_tf_config.text_embed_dim = 1024
+        mock_tf_config.num_layers = 2
+
+        mock_od_config = mocker.MagicMock(spec=OmniDiffusionConfig)
+        mock_od_config.tf_model_config = mock_tf_config
+        mock_od_config.parallel_config = parallel_config
+
+        model = GlmImageTransformer2DModel(
+            od_config=mock_od_config,
+            quant_config=mock_quant_config,
+        )
+
+        # norm_out.linear should NOT have quant_config to preserve output precision
+        assert model.norm_out.linear.quant_config is None
+
+    def test_model_has_sp_plan(self, mocker: MockerFixture):
+        """Verify model has _sp_plan defined for sequence parallelism."""
+        from vllm_omni.diffusion.data import OmniDiffusionConfig
+
+        parallel_config = DiffusionParallelConfig(
+            tensor_parallel_size=1,
+            sequence_parallel_size=1,
+        )
+
+        mock_tf_config = mocker.MagicMock()
+        mock_tf_config.patch_size = 2
+        mock_tf_config.in_channels = 16
+        mock_tf_config.out_channels = 16
+        mock_tf_config.num_attention_heads = 64
+        mock_tf_config.attention_head_dim = 40
+        mock_tf_config.time_embed_dim = 512
+        mock_tf_config.condition_dim = 256
+        mock_tf_config.prior_vq_quantizer_codebook_size = 16384
+        mock_tf_config.text_embed_dim = 1024
+        mock_tf_config.num_layers = 2
+
+        mock_od_config = mocker.MagicMock(spec=OmniDiffusionConfig)
+        mock_od_config.tf_model_config = mock_tf_config
+        mock_od_config.parallel_config = parallel_config
+
+        model = GlmImageTransformer2DModel(
+            od_config=mock_od_config,
+            quant_config=None,
+        )
+
+        # Verify _sp_plan exists
+        assert hasattr(model, "_sp_plan")
+        assert "prepare" in model._sp_plan
+        assert "proj_out" in model._sp_plan
+
+    def test_model_has_hsdp_shard_conditions(self, mocker: MockerFixture):
+        """Verify model has _hsdp_shard_conditions for HSDP parallelism."""
+        from vllm_omni.diffusion.data import OmniDiffusionConfig
+
+        parallel_config = DiffusionParallelConfig(
+            tensor_parallel_size=1,
+            sequence_parallel_size=1,
+        )
+
+        mock_tf_config = mocker.MagicMock()
+        mock_tf_config.patch_size = 2
+        mock_tf_config.in_channels = 16
+        mock_tf_config.out_channels = 16
+        mock_tf_config.num_attention_heads = 64
+        mock_tf_config.attention_head_dim = 40
+        mock_tf_config.time_embed_dim = 512
+        mock_tf_config.condition_dim = 256
+        mock_tf_config.prior_vq_quantizer_codebook_size = 16384
+        mock_tf_config.text_embed_dim = 1024
+        mock_tf_config.num_layers = 2
+
+        mock_od_config = mocker.MagicMock(spec=OmniDiffusionConfig)
+        mock_od_config.tf_model_config = mock_tf_config
+        mock_od_config.parallel_config = parallel_config
+
+        model = GlmImageTransformer2DModel(
+            od_config=mock_od_config,
+            quant_config=None,
+        )
+
+        assert hasattr(model, "_hsdp_shard_conditions")
+        assert len(model._hsdp_shard_conditions) > 0
+
+    def test_model_creates_kv_cache(self, mocker: MockerFixture):
+        """Verify model can create KV cache for image editing."""
+        from vllm_omni.diffusion.data import OmniDiffusionConfig
+
+        parallel_config = DiffusionParallelConfig(
+            tensor_parallel_size=1,
+            sequence_parallel_size=1,
+        )
+
+        mock_tf_config = mocker.MagicMock()
+        mock_tf_config.patch_size = 2
+        mock_tf_config.in_channels = 16
+        mock_tf_config.out_channels = 16
+        mock_tf_config.num_attention_heads = 64
+        mock_tf_config.attention_head_dim = 40
+        mock_tf_config.time_embed_dim = 512
+        mock_tf_config.condition_dim = 256
+        mock_tf_config.prior_vq_quantizer_codebook_size = 16384
+        mock_tf_config.text_embed_dim = 1024
+        mock_tf_config.num_layers = 2
+
+        mock_od_config = mocker.MagicMock(spec=OmniDiffusionConfig)
+        mock_od_config.tf_model_config = mock_tf_config
+        mock_od_config.parallel_config = parallel_config
+
+        model = GlmImageTransformer2DModel(
+            od_config=mock_od_config,
+            quant_config=None,
+        )
+
+        kv_cache = model.create_kv_cache()
+        assert kv_cache is not None
+        assert len(kv_cache) == 2  # num_layers
diff --git a/tests/e2e/offline_inference/test_glm_image_autoround_w4a16_expansion.py b/tests/e2e/offline_inference/test_glm_image_autoround_w4a16_expansion.py
new file mode 100644
index 00000000000..2ea4dc64154
--- /dev/null
+++ b/tests/e2e/offline_inference/test_glm_image_autoround_w4a16_expansion.py
@@ -0,0 +1,238 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""E2E tests for GLM-Image AutoRound W4A16 quantized inference.
+
+These tests cover text-to-image and image-to-image generation with
+the W4A16 quantized GLM-Image model.
+
+Requirements:
+  - 2 CUDA GPUs (H100 or equivalent)
+  - The quantized model checkpoint (Intel/GLM-Image-int4-AutoRound)
+"""
+
+import gc
+import math
+import os
+
+import numpy as np
+import pytest
+from PIL import Image
+from vllm import SamplingParams
+
+from tests.helpers.env import DeviceMemoryMonitor
+from tests.helpers.mark import hardware_test
+from tests.helpers.media import generate_synthetic_image
+from tests.helpers.runtime import OmniRunnerHandler
+from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config
+from vllm_omni.inputs.data import OmniDiffusionSamplingParams
+from vllm_omni.outputs import OmniRequestOutput
+from vllm_omni.platforms import current_omni_platform
+
+QUANTIZED_MODEL = os.environ.get("GLM_IMAGE_AUTOROUND_MODEL", "Intel/GLM-Image-int4-AutoRound")
+
+# Small output resolution to keep GPU memory & run time manageable
+HEIGHT = 256
+WIDTH = 256
+NUM_STEPS = 2  # passed as num_inference_steps; minimal for a smoke test
+
+# GLM-Image AR generation config (from generation_config.json)
+GLM_IMAGE_EOS_TOKEN_ID = 16385  # used as the AR stop token id
+GLM_IMAGE_VISION_VOCAB_SIZE = 16512  # used as the AR top_k
+
+_CI_DEPLOY = get_deploy_config_path("glm_image.yaml")  # base config that _get_stage_config() overrides
+
+
+def _get_stage_config():
+    """Return the CI deploy config with ``enforce_eager`` switched on for stages 0 and 1."""
+    # Build one independent override dict per stage index.
+    eager_overrides = {
+        stage_index: {"enforce_eager": True}
+        for stage_index in (0, 1)
+    }
+    return modify_stage_config(
+        _CI_DEPLOY,
+        updates={"stages": eager_overrides},
+    )
+
+
+stage_config = _get_stage_config()  # built once at import time and shared by both tests below
+
+# (model, stage_config_path) pair handed to the indirectly parametrized ``omni_runner`` fixture
+_OMNI_RUNNER_PARAM = (QUANTIZED_MODEL, stage_config)
+
+
+def compute_max_tokens(height: int, width: int, factor: int = 32) -> int:
+    """Return the AR ``max_tokens`` budget: small-grid plus large-grid token counts, plus one."""
+    token_h = height // factor
+    token_w = width // factor
+    large_tokens = token_h * token_w
+    # A side shorter than ``factor`` gives a zero grid dimension; fall back to 1:1 so ``1 / ratio`` is defined.
+    ratio = token_h / token_w if token_h > 0 and token_w > 0 else 1.0
+    small_token_h = max(1, int(math.sqrt(ratio) * (factor // 2)))
+    small_token_w = max(1, int(math.sqrt(1 / ratio) * (factor // 2)))
+    small_tokens = small_token_h * small_token_w
+
+    return small_tokens + large_tokens + 1
+
+
+def _ar_sampling_params(max_tokens: int, height: int, width: int, seed: int = 42) -> SamplingParams:
+    """Assemble the autoregressive-stage ``SamplingParams`` used by the GLM-Image tests."""
+    # The target resolution travels in ``extra_args``.
+    target_size = {"target_h": height, "target_w": width}
+
+    return SamplingParams(
+        temperature=0.9,
+        top_p=0.75,
+        top_k=GLM_IMAGE_VISION_VOCAB_SIZE,
+        max_tokens=max_tokens,
+        stop_token_ids=[GLM_IMAGE_EOS_TOKEN_ID],
+        seed=seed,
+        detokenize=False,
+        extra_args=target_size,
+    )
+
+
+def _diffusion_sampling_params(
+    height: int = HEIGHT,
+    width: int = WIDTH,
+    num_steps: int = NUM_STEPS,
+    seed: int = 42,
+) -> OmniDiffusionSamplingParams:
+    """Assemble the diffusion-stage sampling params; ``guidance_scale`` is pinned to 0.0."""
+    size = {"height": height, "width": width}
+    return OmniDiffusionSamplingParams(
+        **size,
+        num_inference_steps=num_steps,
+        guidance_scale=0.0,
+        seed=seed,
+    )
+
+
+pytestmark = [  # module-level marks: pytest applies them to every test in this file
+    pytest.mark.full_model,
+    pytest.mark.diffusion,
+]
+
+
+# ------------------------------------------------------------------
+# Test: text-to-image generation produces a valid image (quantized)
+# ------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("omni_runner", [_OMNI_RUNNER_PARAM], indirect=True)
+@hardware_test(res={"cuda": "H100"}, num_cards=2)
+def test_glm_image_autoround_w4a16_generates_image(omni_runner_handler: OmniRunnerHandler):
+    """Run text-to-image on the W4A16 checkpoint and check the image's type, size and pixel variation."""
+    gc.collect()
+    current_omni_platform.empty_cache()
+    current_omni_platform.reset_peak_memory_stats()
+    monitor = DeviceMemoryMonitor(device_index=current_omni_platform.current_device(), interval=0.02)
+
+    prompt_dict = {
+        "prompt": "A photo of a cat sitting on a laptop keyboard",
+        "height": HEIGHT,
+        "width": WIDTH,
+        "mm_processor_kwargs": {
+            "target_h": HEIGHT,
+            "target_w": WIDTH,
+        },
+    }
+    ar_params = _ar_sampling_params(
+        max_tokens=compute_max_tokens(HEIGHT, WIDTH),
+        height=HEIGHT,
+        width=WIDTH,
+        seed=42,
+    )
+    diffusion_params = _diffusion_sampling_params(
+        height=HEIGHT,
+        width=WIDTH,
+        num_steps=NUM_STEPS,
+        seed=42,
+    )
+
+    monitor.start()
+    try:
+        outputs = omni_runner_handler.runner.generate(
+            [prompt_dict],
+            [ar_params, diffusion_params],
+        )
+    finally:
+        # Stop the monitor even when generate() raises, so it is never left running.
+        monitor.stop()
+
+    first_output = outputs[0]
+    assert first_output.final_output_type == "image"
+    req_out = first_output.request_output
+    assert isinstance(req_out, OmniRequestOutput) and hasattr(req_out, "images")
+    images = req_out.images
+    assert len(images) >= 1, "Expected at least one generated image"
+    img = images[0]
+    assert isinstance(img, Image.Image)
+    assert img.width == WIDTH, f"Expected width {WIDTH}, got {img.width}"
+    assert img.height == HEIGHT, f"Expected height {HEIGHT}, got {img.height}"
+
+    # Sanity: any constant-colour image (all-zero or not) has std 0, so require real pixel variation.
+    arr = np.array(img)
+    assert arr.std() > 1.0, "Generated image appears blank (std ≈ 0)"
+
+    gc.collect()
+    current_omni_platform.empty_cache()
+
+
+# ------------------------------------------------------------------
+# Test: image-to-image generation (quantized)
+# ------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("omni_runner", [_OMNI_RUNNER_PARAM], indirect=True)
+@hardware_test(res={"cuda": "H100"}, num_cards=2)
+def test_glm_image_autoround_w4a16_image_to_image(omni_runner_handler: OmniRunnerHandler):
+    """Run image-to-image on the W4A16 checkpoint and check the output image's type and size."""
+    # Synthetic reference image supplied as the ``image`` entry of ``multi_modal_data``.
+    reference = generate_synthetic_image(WIDTH, HEIGHT)["np_array"]
+
+    gc.collect()
+    current_omni_platform.empty_cache()
+    current_omni_platform.reset_peak_memory_stats()
+
+    target_size = {"target_h": HEIGHT, "target_w": WIDTH}
+    request = {
+        "prompt": "Make it look like winter",
+        "multi_modal_data": {"image": reference},
+        "height": HEIGHT,
+        "width": WIDTH,
+        "mm_processor_kwargs": target_size,
+    }
+    # Sampling params are listed in stage order: AR first, then diffusion.
+    stage_params = [
+        _ar_sampling_params(
+            max_tokens=compute_max_tokens(HEIGHT, WIDTH),
+            height=HEIGHT,
+            width=WIDTH,
+            seed=42,
+        ),
+        _diffusion_sampling_params(
+            height=HEIGHT,
+            width=WIDTH,
+            num_steps=NUM_STEPS,
+            seed=42,
+        ),
+    ]
+
+    results = omni_runner_handler.runner.generate([request], stage_params)
+
+    head = results[0]
+    assert head.final_output_type == "image"
+    request_output = head.request_output
+    assert isinstance(request_output, OmniRequestOutput) and hasattr(request_output, "images")
+    generated = request_output.images
+
+    assert len(generated) >= 1, "Expected at least one generated image"
+    picture = generated[0]
+    assert isinstance(picture, Image.Image)
+    assert picture.width == WIDTH
+    assert picture.height == HEIGHT
+
+    # Collect garbage and empty the platform cache once the checks are done.
+    gc.collect()
+    current_omni_platform.empty_cache()
diff --git a/tests/model_executor/models/glm_image/test_glm_image_init.py b/tests/model_executor/models/glm_image/test_glm_image_init.py
new file mode 100644
index 00000000000..eb42e9649fe
--- /dev/null
+++ b/tests/model_executor/models/glm_image/test_glm_image_init.py
@@ -0,0 +1,76 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for GLM-Image model __init__.py lazy import pattern.
+
+The ``__init__.py`` uses ``__getattr__`` for lazy loading to avoid importing
+``transformers.models.glm_image`` at module init, which may not be available
+in all environments.
+"""
+
+import pytest
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
+
+class TestLazyImport:
+    """Exercise the module-level ``__getattr__`` hook and ``__all__`` of the glm_image package."""
+
+    def test_getattr_exists_and_is_callable(self):
+        """The package defines ``__getattr__`` and it is callable."""
+        import vllm_omni.model_executor.models.glm_image as pkg
+
+        assert hasattr(pkg, "__getattr__")
+        assert callable(pkg.__getattr__)
+
+    def test_getattr_returns_class_for_known_attribute(self):
+        """Calling the hook with the exported name yields a class carrying that name."""
+        import vllm_omni.model_executor.models.glm_image as pkg
+
+        exported_name = "GlmImageForConditionalGeneration"
+        # Invoke the hook directly rather than through normal attribute access.
+        resolved = pkg.__getattr__(exported_name)
+
+        assert resolved is not None
+        assert isinstance(resolved, type)
+        assert resolved.__name__ == exported_name
+
+    def test_getattr_raises_for_unknown_attribute(self):
+        """An unrecognised name makes the hook raise ``AttributeError``."""
+        import vllm_omni.model_executor.models.glm_image as pkg
+
+        # The error message is required to contain "has no attribute".
+        with pytest.raises(AttributeError, match="has no attribute"):
+            pkg.__getattr__("UnknownClass")
+
+    def test___all___exports_correct_symbols(self):
+        """``__all__`` is defined and lists ``GlmImageForConditionalGeneration``."""
+        import vllm_omni.model_executor.models.glm_image as pkg
+
+        assert hasattr(pkg, "__all__")
+        assert "GlmImageForConditionalGeneration" in pkg.__all__
+
+
+class TestLazyImportDoesNotImportTransformersAtInit:
+    """Structural checks only. NOTE(review): no test here asserts that transformers is not imported eagerly."""
+
+    def test_glm_image_module_has_getattr(self):
+        """Check that the package defines a callable ``__getattr__`` and an ``__all__``."""
+        import vllm_omni.model_executor.models.glm_image as glm_image_pkg
+
+        # The module must expose a callable module-level __getattr__
+        assert hasattr(glm_image_pkg, "__getattr__")
+        assert callable(glm_image_pkg.__getattr__)
+
+        # ... and an __all__ attribute (its contents are not inspected here)
+        assert hasattr(glm_image_pkg, "__all__")
+
+    def test_module_has_proper_structure(self):
+        """Check the imported object's ``__name__`` and that it has a ``__file__`` attribute."""
+        import vllm_omni.model_executor.models.glm_image as glm_image_pkg
+
+        # __name__ must be the fully qualified package path
+        assert hasattr(glm_image_pkg, "__name__")
+        assert glm_image_pkg.__name__ == "vllm_omni.model_executor.models.glm_image"
+
+        # Should have __file__ (even if it's a package)
+        assert hasattr(glm_image_pkg, "__file__")
diff --git a/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py b/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py
index 37ea454de20..152286ec220 100644
--- a/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py
+++ b/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py
@@ -77,7 +77,7 @@ def load_pipeline(cls, model_path: str, device: str, dtype: torch.dtype) -> Any:
                 os.path.join("transformer", "config.json"),
                 od_config.model,
             )
-            od_config.tf_model_config = TransformerConfig.from_dict(tf_config_dict)
+            od_config.set_tf_model_config(TransformerConfig.from_dict(tf_config_dict))
 
         loader = DiffusersPipelineLoader(LoadConfig(), od_config=od_config)
         # load_model will handle dtypes / device placement, put in .eval() mode
diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py
index ddb32aa2025..56a7066e390 100644
--- a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py
+++ b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py
@@ -15,6 +15,7 @@
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
+    ReplicatedLinear,
     RowParallelLinear,
 )
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -33,6 +34,9 @@
 )
 from vllm_omni.diffusion.forward_context import get_forward_context
 
+if TYPE_CHECKING:
+    from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+
 logger = init_logger(__name__)
 
 
@@ -230,11 +234,24 @@ def forward(
 class GlmImageAdaLayerNormZero(nn.Module):
     """Adaptive LayerNorm with zero initialization for both image and text streams."""
 
-    def __init__(self, embedding_dim: int, dim: int) -> None:
+    def __init__(
+        self,
+        embedding_dim: int,
+        dim: int,
+        quant_config: "QuantizationConfig | None" = None,
+        prefix: str = "",
+    ) -> None:
         super().__init__()
         self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-5)
         self.norm_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-5)
-        self.linear = nn.Linear(embedding_dim, 12 * dim, bias=True)
+        self.linear = ReplicatedLinear(
+            embedding_dim,
+            12 * dim,
+            bias=True,
+            return_bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear",
+        )
 
     def forward(
         self, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor, temb: torch.Tensor
@@ -244,6 +261,8 @@ def forward(
         norm_encoder_hidden_states = self.norm_context(encoder_hidden_states).to(dtype=dtype)
 
         emb = self.linear(temb)
+        if isinstance(emb, tuple):
+            emb = emb[0]
         (
             shift_msa,
             c_shift_msa,
@@ -286,14 +305,25 @@ def __init__(
         elementwise_affine: bool = True,
         eps: float = 1e-5,
         bias: bool = True,
+        quant_config: "QuantizationConfig | None" = None,
+        prefix: str = "",
     ):
         super().__init__()
-        self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
+        self.linear = ReplicatedLinear(
+            conditioning_embedding_dim,
+            embedding_dim * 2,
+            bias=bias,
+            return_bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear",
+        )
         self.norm = nn.LayerNorm(embedding_dim, eps=eps, elementwise_affine=elementwise_affine)
 
     def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor:
         # NO SiLU here
         emb = self.linear(conditioning_embedding.to(x.dtype))
+        if isinstance(emb, tuple):
+            emb = emb[0]
         scale, shift = torch.chunk(emb, 2, dim=1)
         x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
         return x
@@ -469,6 +499,7 @@ def __init__(
         out_bias: bool = True,
         eps: float = 1e-5,
         quant_config: "QuantizationConfig | None" = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.dim = dim
@@ -485,6 +516,7 @@ def __init__(
             bias=True,
             return_bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.to_qkv",
         )
 
         # QK normalization (LayerNorm, not RMSNorm for GLM-Image)
@@ -501,6 +533,7 @@ def __init__(
                     input_is_parallel=True,
                     return_bias=False,
                     quant_config=quant_config,
+                    prefix=f"{prefix}.to_out.0",
                 ),
                 nn.Dropout(0.0),
             ]
@@ -554,7 +587,7 @@ def forward(
         hidden_states_combined = torch.cat([encoder_hidden_states, hidden_states], dim=1)
 
         # QKV projection
-        qkv = self.to_qkv(hidden_states_combined)
+        qkv = self.to_qkv(hidden_states_combined.contiguous())
         q_size = self.to_qkv.num_heads * self.head_dim
         kv_size = self.to_qkv.num_kv_heads * self.head_dim
         query, key, value = qkv.split([q_size, kv_size, kv_size], dim=-1)
@@ -604,8 +637,12 @@ def forward(
             # Project combined [text, image] outputs, then split.
             # This keeps SP numerically aligned with the non-SP path.
             joint_hidden_states_out = joint_hidden_states_out.flatten(2, 3).to(dtype)
+            # Contiguous for FP8/W4A16 quantized RowParallelLinear
             for module in self.to_out:
-                joint_hidden_states_out = module(joint_hidden_states_out)
+                if isinstance(module, RowParallelLinear):
+                    joint_hidden_states_out = module(joint_hidden_states_out.contiguous())
+                else:
+                    joint_hidden_states_out = module(joint_hidden_states_out)
 
             encoder_hidden_states_out = joint_hidden_states_out[:, :text_seq_length, :]
             hidden_states_out = joint_hidden_states_out[:, text_seq_length:, :]
@@ -645,7 +682,10 @@ def forward(
 
             # Output projection
             for module in self.to_out:
-                hidden_states_out = module(hidden_states_out)
+                if isinstance(module, RowParallelLinear):
+                    hidden_states_out = module(hidden_states_out.contiguous())
+                else:
+                    hidden_states_out = module(hidden_states_out)
 
             # Split back to text and image
             encoder_hidden_states_out = hidden_states_out[:, :text_seq_length, :]
@@ -663,6 +703,7 @@ def __init__(
         approximate: str = "none",
         bias: bool = True,
         quant_config: "QuantizationConfig | None" = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.proj = ColumnParallelLinear(
@@ -672,6 +713,7 @@ def __init__(
             gather_output=False,
             return_bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.proj",
         )
         self.approximate = approximate
 
@@ -688,6 +730,7 @@ def __init__(
         *,
         bias: bool = True,
         quant_config: "QuantizationConfig | None" = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.proj = ColumnParallelLinear(
@@ -697,6 +740,7 @@ def __init__(
             gather_output=False,
             return_bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.proj",
         )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -714,6 +758,7 @@ def __init__(
         bias: bool = True,
         activation_fn: str = "gelu",
         quant_config: "QuantizationConfig | None" = None,
+        prefix: str = "",
     ):
         super().__init__()
         inner_dim = inner_dim or int(dim * mult)
@@ -721,7 +766,7 @@ def __init__(
 
         if activation_fn == "linear-silu":
             layers: list[nn.Module] = [
-                ColumnParallelSiLU(dim, inner_dim, bias=bias, quant_config=quant_config),
+                ColumnParallelSiLU(dim, inner_dim, bias=bias, quant_config=quant_config, prefix=f"{prefix}.net.0"),
                 nn.Identity(),
                 RowParallelLinear(
                     inner_dim,
@@ -730,12 +775,20 @@ def __init__(
                     input_is_parallel=True,
                     return_bias=False,
                     quant_config=quant_config,
+                    prefix=f"{prefix}.net.2",
                 ),
             ]
         else:
             approximate = "tanh" if activation_fn == "gelu-approximate" else "none"
             layers = [
-                ColumnParallelGELU(dim, inner_dim, approximate=approximate, bias=bias, quant_config=quant_config),
+                ColumnParallelGELU(
+                    dim,
+                    inner_dim,
+                    approximate=approximate,
+                    bias=bias,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.net.0",
+                ),
                 nn.Identity(),
                 RowParallelLinear(
                     inner_dim,
@@ -744,6 +797,7 @@ def __init__(
                     input_is_parallel=True,
                     return_bias=False,
                     quant_config=quant_config,
+                    prefix=f"{prefix}.net.2",
                 ),
             ]
 
@@ -751,7 +805,12 @@ def __init__(
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         for module in self.net:
-            hidden_states = module(hidden_states)
+            # net holds a Column*-activation wrapper, nn.Identity and a RowParallelLinear; the linears use
+            # return_bias=False, so nothing is tuple-unpacked. Only the row-parallel input is made contiguous.
+            if isinstance(module, RowParallelLinear):
+                hidden_states = module(hidden_states.contiguous())
+            else:
+                hidden_states = module(hidden_states)
         return hidden_states
 
 
@@ -765,19 +824,21 @@ def __init__(
         attention_head_dim: int = 40,
         time_embed_dim: int = 512,
         ffn_hidden_dim: int | None = None,
-        quant_config: "QuantizationConfig | None" = None,
         parallel_config: DiffusionParallelConfig | None = None,
+        quant_config: "QuantizationConfig | None" = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
 
         # 1. Attention with AdaLN
-        self.norm1 = GlmImageAdaLayerNormZero(time_embed_dim, dim)
+        self.norm1 = GlmImageAdaLayerNormZero(time_embed_dim, dim, quant_config=quant_config, prefix=f"{prefix}.norm1")
         self.attn1 = GlmImageAttention(
             dim=dim,
             num_heads=num_attention_heads,
             head_dim=attention_head_dim,
-            quant_config=quant_config,
             parallel_config=parallel_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn1",
         )
 
         # 2. Feedforward
@@ -789,6 +850,7 @@ def __init__(
             inner_dim=ffn_hidden_dim,
             activation_fn="gelu-approximate",
             quant_config=quant_config,
+            prefix=f"{prefix}.ff",
         )
 
     def forward(
@@ -959,6 +1021,7 @@ def __init__(
             inner_dim=inner_dim,
             activation_fn="gelu",
             quant_config=quant_config,
+            prefix="glyph_projector",
         )
         self.prior_token_embedding = nn.Embedding(prior_vq_quantizer_codebook_size, inner_dim)
         self.prior_projector = GlmImageFeedForward(
@@ -967,6 +1030,7 @@ def __init__(
             inner_dim=inner_dim,
             activation_fn="linear-silu",
             quant_config=quant_config,
+            prefix="prior_projector",
         )
 
         # Prepare module for SP (encapsulates patch embedding and RoPE for _sp_plan)
@@ -988,15 +1052,20 @@ def __init__(
                     attention_head_dim,
                     time_embed_dim,
                     ffn_hidden_dim=ffn_hidden_dim,
-                    quant_config=quant_config,
                     parallel_config=self.parallel_config,
+                    quant_config=quant_config,
+                    prefix=f"transformer_blocks.{i}",
                 )
-                for _ in range(num_layers)
+                for i in range(num_layers)
             ]
         )
 
         # 4. Output projection
-        self.norm_out = GlmImageAdaLayerNormContinuous(inner_dim, time_embed_dim, elementwise_affine=False)
+        # Final modulation feeds proj_out; quant_config is NOT applied here
+        # to avoid precision degradation in the final projection layer.
+        self.norm_out = GlmImageAdaLayerNormContinuous(
+            inner_dim, time_embed_dim, elementwise_affine=False, quant_config=None, prefix="norm_out"
+        )
         self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels, bias=True)
 
     def forward(
diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
index 97cba18c234..f5b7d608abf 100644
--- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
+++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py
@@ -301,10 +301,7 @@ def __init__(
 
         # Load transformer (DiT)
         logger.info("Loading GlmImageTransformer2DModel (DiT)...")
-        self.transformer = GlmImageTransformer2DModel(
-            od_config=od_config,
-            quant_config=od_config.quantization_config,
-        )
+        self.transformer = GlmImageTransformer2DModel(od_config=od_config, quant_config=od_config.quantization_config)
 
         # Weight sources for DiT loading
         self.weights_sources = [