diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index ae93d2353a3..c77727e216c 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -608,6 +608,7 @@ steps: timeout_in_minutes: 60 commands: - pytest -sv tests/e2e/ -k "not test_wan and not test_bagel_expansion and not hunyuan" -m "full_model and diffusion and L4" --run-level "full_model" --ignore=tests/e2e/accuracy + - pytest -sv tests/e2e/offline_inference/test_glm_image_autoround_w4a16_expansion.py -m "full_model and diffusion and L4" --run-level "full_model" agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: diff --git a/docs/user_guide/quantization/autoround.md b/docs/user_guide/quantization/autoround.md index 88fed3b62b3..3afc8b06233 100644 --- a/docs/user_guide/quantization/autoround.md +++ b/docs/user_guide/quantization/autoround.md @@ -52,8 +52,8 @@ vLLM-Omni's runtime module names. | Model | Scope | Status | Notes | |-------|-------|--------|-------| +| GLM-Image | Diffusion transformer | ✅ | `Intel/GLM-Image-int4-AutoRound` | | BAGEL | Checkpoint-defined diffusion or transformer stage | Not validated | Requires a compatible AutoRound checkpoint | -| GLM-Image | Checkpoint-defined diffusion or transformer stage | Not validated | Requires a compatible AutoRound checkpoint | ## Configuration diff --git a/tests/diffusion/models/glm_image/test_glm_image_quantization.py b/tests/diffusion/models/glm_image/test_glm_image_quantization.py new file mode 100644 index 00000000000..853150af78f --- /dev/null +++ b/tests/diffusion/models/glm_image/test_glm_image_quantization.py @@ -0,0 +1,699 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for GLM-Image quantization support (W4A16/AutoRound). + +These tests verify that the GLM-Image DiT transformer correctly accepts and uses +quantization configs for W4A16/AutoRound quantization support. +""" + +import pytest +import torch +from pytest_mock import MockerFixture + +from vllm_omni.diffusion.data import DiffusionParallelConfig +from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( + ColumnParallelGELU, + ColumnParallelSiLU, + GlmImageAdaLayerNormContinuous, + GlmImageAdaLayerNormZero, + GlmImageAttention, + GlmImageFeedForward, + GlmImageImageProjector, + GlmImagePrepare, + GlmImageRotaryPosEmbed, + GlmImageTransformer2DModel, + GlmImageTransformerBlock, + _positive_divisors, + validate_glm_image_tp_constraints, +) +from vllm_omni.model_executor.models.glm_image.pipeline import GLM_IMAGE_PIPELINE + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +@pytest.fixture(scope="function", autouse=True) +def setup_mocks(mocker: MockerFixture): + """Set up common mocks for all tests.""" + mocker.patch( + "vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size", + return_value=1, + ) + mock_get_tp_group = mocker.patch("vllm.distributed.parallel_state.get_tp_group") + mock_tp_group = mocker.MagicMock() + mock_tp_group.world_size = 1 + mock_get_tp_group.return_value = mock_tp_group + yield + + +class TestPositiveDivisors: + """Test _positive_divisors helper function.""" + + def test_divisors_of_1(self): + assert _positive_divisors(1) == {1} + + def test_divisors_of_12(self): + assert _positive_divisors(12) == {1, 2, 3, 4, 6, 12} + + def test_divisors_of_prime(self): + assert _positive_divisors(7) == {1, 7} + + def test_divisors_of_0_returns_empty(self): + assert _positive_divisors(0) == set() + + def test_divisors_of_negative_returns_empty(self): + assert _positive_divisors(-5) == set() + + +class TestValidateGlmImageTpConstraints: + """Test TP constraint validation for GLM-Image.""" + + def test_valid_tp_size_1(self): + """TP=1 should always be valid.""" + result = validate_glm_image_tp_constraints( + dim=2560, + num_heads=64, + ffn_hidden_dim=10240, + tensor_parallel_size=1, + ) + assert 1 in result + + def test_valid_tp_size_2_for_divisible_dim(self): + """TP=2 is valid when all dims are divisible by 2.""" + result = validate_glm_image_tp_constraints( + dim=2560, + num_heads=64, + ffn_hidden_dim=10240, + tensor_parallel_size=2, + ) + assert 1 in result + assert 2 in result + + def test_valid_tp_size_4_for_divisible_dim(self): + """TP=4 is valid when all dims are divisible by 4.""" + result = validate_glm_image_tp_constraints( + dim=2560, + num_heads=64, + ffn_hidden_dim=10240, + tensor_parallel_size=4, + ) + assert 1 in result + assert 2 in result + assert 4 in result + + def test_invalid_tp_size_3_for_divisible_dim(self): + """TP=3 is invalid when dim is not divisible by 3.""" + with pytest.raises(ValueError, match="dim % tensor_parallel_size == 0"): + validate_glm_image_tp_constraints( + dim=2560, # 2560 % 3 != 0 + num_heads=64, + ffn_hidden_dim=10240, + tensor_parallel_size=3, + ) + + def test_invalid_tp_size_zero(self): + """TP=0 should raise error.""" + with pytest.raises(ValueError, match="tensor_parallel_size must be > 0"): + validate_glm_image_tp_constraints( + dim=2560, + num_heads=64, + ffn_hidden_dim=10240, + tensor_parallel_size=0, + ) + + def test_invalid_tp_size_negative(self): + """Negative TP size should raise error.""" + with pytest.raises(ValueError, match="tensor_parallel_size must be > 0"): + validate_glm_image_tp_constraints( + dim=2560, + num_heads=64, + ffn_hidden_dim=10240, + tensor_parallel_size=-1, + ) + + +class TestGlmImageAdaLayerNormZeroQuantization: + """Test GlmImageAdaLayerNormZero with quantization config.""" + + def test_accepts_quant_config_parameter(self, mocker: MockerFixture): + """Verify the class accepts quant_config parameter.""" + mock_quant_config = mocker.MagicMock() + layer = GlmImageAdaLayerNormZero( + embedding_dim=512, + dim=2560, + quant_config=mock_quant_config, + prefix="test.norm1", + ) + assert layer.linear.quant_config is mock_quant_config + + def test_accepts_none_quant_config(self): + """Verify quant_config=None is accepted.""" + layer = GlmImageAdaLayerNormZero( + embedding_dim=512, + dim=2560, + quant_config=None, + ) + assert layer.linear.quant_config is None + + def test_forward_handles_tuple_return_from_linear(self): + """Verify forward handles tuple returns from ReplicatedLinear.""" + layer = GlmImageAdaLayerNormZero( + embedding_dim=512, + dim=2560, + quant_config=None, + ) + + batch_size = 2 + seq_len = 10 + hidden_states = torch.randn(batch_size, seq_len, 2560) + encoder_hidden_states = torch.randn(batch_size, seq_len, 2560) + temb = torch.randn(batch_size, 512) + + # This should work regardless of whether linear returns tuple or tensor + result = layer(hidden_states, encoder_hidden_states, temb) + assert len(result) == 10 # Should return 10 chunks + + +class TestGlmImageAdaLayerNormContinuousQuantization: + """Test GlmImageAdaLayerNormContinuous with quantization config.""" + + def test_accepts_quant_config_parameter(self, mocker: MockerFixture): + """Verify the class accepts quant_config parameter.""" + mock_quant_config = mocker.MagicMock() + layer = GlmImageAdaLayerNormContinuous( + embedding_dim=2560, + conditioning_embedding_dim=512, + quant_config=mock_quant_config, + prefix="test.norm_out", + ) + assert layer.linear.quant_config is mock_quant_config + + def test_accepts_none_quant_config(self): + """Verify quant_config=None is accepted.""" + layer = GlmImageAdaLayerNormContinuous( + embedding_dim=2560, + conditioning_embedding_dim=512, + quant_config=None, + ) + assert layer.linear.quant_config is None + + def test_forward_handles_tuple_return_from_linear(self): + """Verify forward handles tuple returns from ReplicatedLinear.""" + layer = GlmImageAdaLayerNormContinuous( + embedding_dim=2560, + conditioning_embedding_dim=512, + quant_config=None, + ) + + batch_size = 2 + seq_len = 10 + x = torch.randn(batch_size, seq_len, 2560) + conditioning_embedding = torch.randn(batch_size, 512) + + result = layer(x, conditioning_embedding) + assert result.shape == (batch_size, seq_len, 2560) + + +class TestGlmImageAttentionQuantization: + """Test GlmImageAttention with quantization config.""" + + def test_accepts_quant_config_parameter(self, mocker: MockerFixture): + """Verify GlmImageAttention accepts quant_config parameter.""" + mock_quant_config = mocker.MagicMock() + attn = GlmImageAttention( + dim=2560, + num_heads=64, + head_dim=40, + quant_config=mock_quant_config, + prefix="test.attn1", + ) + assert attn.to_qkv.quant_config is mock_quant_config + + def test_accepts_none_quant_config(self): + """Verify quant_config=None is accepted.""" + attn = GlmImageAttention( + dim=2560, + num_heads=64, + head_dim=40, + quant_config=None, + ) + assert attn.to_qkv.quant_config is None + + +class TestColumnParallelModulesQuantization: + """Test ColumnParallelGELU and ColumnParallelSiLU with quantization config.""" + + def test_column_parallel_gelu_accepts_quant_config(self, mocker: MockerFixture): + """Verify ColumnParallelGELU accepts quant_config.""" + mock_quant_config = mocker.MagicMock() + layer = ColumnParallelGELU( + dim_in=2560, + dim_out=10240, + quant_config=mock_quant_config, + prefix="test.gelu", + ) + assert layer.proj.quant_config is mock_quant_config + + def test_column_parallel_silu_accepts_quant_config(self, mocker: MockerFixture): + """Verify ColumnParallelSiLU accepts quant_config.""" + mock_quant_config = mocker.MagicMock() + layer = ColumnParallelSiLU( + dim_in=2560, + dim_out=10240, + quant_config=mock_quant_config, + prefix="test.silu", + ) + assert layer.proj.quant_config is mock_quant_config + + +class TestGlmImageFeedForwardQuantization: + """Test GlmImageFeedForward with quantization config.""" + + def test_accepts_quant_config_parameter(self, mocker: MockerFixture): + """Verify GlmImageFeedForward accepts quant_config parameter.""" + mock_quant_config = mocker.MagicMock() + ff = GlmImageFeedForward( + dim=2560, + dim_out=2560, + inner_dim=10240, + activation_fn="gelu-approximate", + quant_config=mock_quant_config, + prefix="test.ff", + ) + # Check that the first layer (ColumnParallelGELU) has quant_config + gelu_layer = ff.net[0] + assert gelu_layer.proj.quant_config is mock_quant_config + + def test_accepts_none_quant_config(self): + """Verify quant_config=None is accepted.""" + ff = GlmImageFeedForward( + dim=2560, + dim_out=2560, + inner_dim=10240, + activation_fn="gelu-approximate", + quant_config=None, + ) + gelu_layer = ff.net[0] + assert gelu_layer.proj.quant_config is None + + def test_linear_silu_activation(self): + """Test linear-silu activation function initialization works.""" + # This test verifies the module can be instantiated with linear-silu activation. + # Full forward testing requires proper TP group setup, so we just verify construction. + ff = GlmImageFeedForward( + dim=2560, + dim_out=2560, + inner_dim=10240, + activation_fn="linear-silu", + quant_config=None, + ) + # Verify the FFN has the correct structure + assert len(ff.net) == 3 # ColumnParallelSiLU, Identity, RowParallelLinear + assert isinstance(ff.net[0], ColumnParallelSiLU) + + +class TestGlmImageTransformerBlockQuantization: + """Test GlmImageTransformerBlock with quantization config.""" + + def test_accepts_quant_config_parameter(self, mocker: MockerFixture): + """Verify GlmImageTransformerBlock accepts quant_config parameter.""" + mock_quant_config = mocker.MagicMock() + parallel_config = DiffusionParallelConfig( + tensor_parallel_size=1, + sequence_parallel_size=1, + ) + block = GlmImageTransformerBlock( + dim=2560, + num_attention_heads=64, + attention_head_dim=40, + time_embed_dim=512, + parallel_config=parallel_config, + quant_config=mock_quant_config, + prefix="test.block", + ) + # Check that inner modules have quant_config + assert block.norm1.linear.quant_config is mock_quant_config + assert block.attn1.to_qkv.quant_config is mock_quant_config + assert block.ff.net[0].proj.quant_config is mock_quant_config + + def test_accepts_none_quant_config(self): + """Verify quant_config=None is accepted.""" + block = GlmImageTransformerBlock( + dim=2560, + num_attention_heads=64, + attention_head_dim=40, + time_embed_dim=512, + quant_config=None, + ) + assert block.norm1.linear.quant_config is None + + +class TestGlmImagePrepareModule: + """Test GlmImagePrepare module.""" + + def test_prepare_module_exists(self): + """Verify GlmImagePrepare module exists and works.""" + projector = GlmImageImageProjector(in_channels=16, hidden_size=2560, patch_size=2) + rope = GlmImageRotaryPosEmbed(dim=40, patch_size=2) + prepare = GlmImagePrepare( + image_projector=projector, + rope=rope, + patch_size=2, + ) + + # Test forward pass + hidden_states = torch.randn(1, 16, 64, 64) # [B, C, H, W] + result = prepare(hidden_states) + + assert len(result) == 5 + hidden_out, rope_cos, rope_sin, height, width = result + assert hidden_out.shape[0] == 1 # batch size + assert hidden_out.shape[1] == 1024 # seq_len (32 * 32) + assert hidden_out.shape[2] == 2560 # hidden_dim + + +class TestGlmImagePipelineConfig: + """Test GLM_IMAGE_PIPELINE configuration.""" + + def test_glm_image_pipeline_config_exists(self): + """Verify GLM_IMAGE_PIPELINE config exists.""" + assert GLM_IMAGE_PIPELINE is not None + + def test_pipeline_has_correct_model_type(self): + """Verify pipeline has correct model_type.""" + assert GLM_IMAGE_PIPELINE.model_type == "glm_image" + + def test_pipeline_has_correct_model_arch(self): + """Verify pipeline has correct model_arch.""" + assert GLM_IMAGE_PIPELINE.model_arch == "GlmImageForConditionalGeneration" + + def test_pipeline_has_two_stages(self): + """Verify pipeline has two stages (AR + DiT).""" + assert len(GLM_IMAGE_PIPELINE.stages) == 2 + + def test_stage_0_is_ar_llm(self): + """Verify stage 0 is the AR LLM stage.""" + stage_0 = GLM_IMAGE_PIPELINE.stages[0] + assert stage_0.stage_id == 0 + assert stage_0.model_stage == "ar" + assert stage_0.execution_type.value == "llm_ar" + assert stage_0.owns_tokenizer is True + + def test_stage_1_is_diffusion(self): + """Verify stage 1 is the DiT diffusion stage.""" + stage_1 = GLM_IMAGE_PIPELINE.stages[1] + assert stage_1.stage_id == 1 + assert stage_1.model_stage == "dit" + assert stage_1.execution_type.value == "diffusion" + assert stage_1.final_output is True + assert stage_1.final_output_type == "image" + assert stage_1.input_sources == (0,) # Takes input from stage 0 + + def test_pipeline_has_diffusers_class_name(self): + """Verify pipeline has diffusers_class_name.""" + assert GLM_IMAGE_PIPELINE.diffusers_class_name == "GlmImagePipeline" + + +class TestGlmImageImageProjector: + """Test GlmImageImageProjector module.""" + + def test_projector_output_shape(self): + """Verify projector produces correct output shape.""" + projector = GlmImageImageProjector( + in_channels=16, + hidden_size=2560, + patch_size=2, + ) + + # Input: [B, C, H, W] = [1, 16, 64, 64] + hidden_states = torch.randn(1, 16, 64, 64) + output = projector(hidden_states) + + # After patchify: [B, H/2*W/2, D] = [1, 32*32, 2560] = [1, 1024, 2560] + assert output.shape == (1, 1024, 2560) + + def test_projector_with_different_patch_sizes(self): + """Verify projector works with different patch sizes.""" + for patch_size in [2, 4]: + projector = GlmImageImageProjector( + in_channels=16, + hidden_size=2560, + patch_size=patch_size, + ) + h, w = 64, 64 + hidden_states = torch.randn(1, 16, h, w) + output = projector(hidden_states) + expected_seq_len = (h // patch_size) * (w // patch_size) + assert output.shape[1] == expected_seq_len + + +class TestGlmImageRotaryPosEmbed: + """Test GlmImageRotaryPosEmbed module.""" + + def test_rope_output_shape(self): + """Verify RoPE produces correct output shape.""" + rope = GlmImageRotaryPosEmbed(dim=40, patch_size=2, theta=10000.0) + hidden_states = torch.randn(1, 16, 64, 64) # [B, C, H, W] + + cos, sin = rope(hidden_states) + + # After patchify: height=32, width=32 + # Output: [32*32, dim] = [1024, 40] + assert cos.shape[0] == 1024 + assert cos.shape[1] == 40 + assert sin.shape == cos.shape + + def test_rope_value_range(self): + """Verify RoPE produces values in valid range.""" + rope = GlmImageRotaryPosEmbed(dim=40, patch_size=2, theta=10000.0) + hidden_states = torch.randn(2, 16, 32, 32) + + cos, sin = rope(hidden_states) + + # cos and sin should be in [-1, 1] + assert cos.min() >= -1.0 and cos.max() <= 1.0 + assert sin.min() >= -1.0 and sin.max() <= 1.0 + + def test_rope_consistency(self): + """Verify RoPE is consistent for same input.""" + rope = GlmImageRotaryPosEmbed(dim=40, patch_size=2, theta=10000.0) + hidden_states = torch.randn(1, 16, 32, 32) + + cos1, sin1 = rope(hidden_states) + cos2, sin2 = rope(hidden_states) + + # Same input should produce same output + assert torch.allclose(cos1, cos2) + assert torch.allclose(sin1, sin2) + + +class TestGlmImageTransformer2DModelQuantization: + """Test GlmImageTransformer2DModel with quantization config.""" + + def test_accepts_quant_config_parameter(self, mocker: MockerFixture): + """Verify the model accepts quant_config parameter.""" + from vllm_omni.diffusion.data import OmniDiffusionConfig + + mock_quant_config = mocker.MagicMock() + parallel_config = DiffusionParallelConfig( + tensor_parallel_size=1, + sequence_parallel_size=1, + ) + + # Create a minimal mock od_config + mock_tf_config = mocker.MagicMock() + mock_tf_config.patch_size = 2 + mock_tf_config.in_channels = 16 + mock_tf_config.out_channels = 16 + mock_tf_config.num_attention_heads = 64 + mock_tf_config.attention_head_dim = 40 + mock_tf_config.time_embed_dim = 512 + mock_tf_config.condition_dim = 256 + mock_tf_config.prior_vq_quantizer_codebook_size = 16384 + mock_tf_config.text_embed_dim = 1024 + mock_tf_config.num_layers = 2 # Small number for testing + + mock_od_config = mocker.MagicMock(spec=OmniDiffusionConfig) + mock_od_config.tf_model_config = mock_tf_config + mock_od_config.parallel_config = parallel_config + + model = GlmImageTransformer2DModel( + od_config=mock_od_config, + quant_config=mock_quant_config, + ) + + # Check that quantization config was passed to transformer blocks + for block in model.transformer_blocks: + assert block.norm1.linear.quant_config is mock_quant_config + assert block.attn1.to_qkv.quant_config is mock_quant_config + assert block.ff.net[0].proj.quant_config is mock_quant_config + + def test_accepts_none_quant_config(self, mocker: MockerFixture): + """Verify quant_config=None is accepted.""" + from vllm_omni.diffusion.data import OmniDiffusionConfig + + parallel_config = DiffusionParallelConfig( + tensor_parallel_size=1, + sequence_parallel_size=1, + ) + + mock_tf_config = mocker.MagicMock() + mock_tf_config.patch_size = 2 + mock_tf_config.in_channels = 16 + mock_tf_config.out_channels = 16 + mock_tf_config.num_attention_heads = 64 + mock_tf_config.attention_head_dim = 40 + mock_tf_config.time_embed_dim = 512 + mock_tf_config.condition_dim = 256 + mock_tf_config.prior_vq_quantizer_codebook_size = 16384 + mock_tf_config.text_embed_dim = 1024 + mock_tf_config.num_layers = 2 + + mock_od_config = mocker.MagicMock(spec=OmniDiffusionConfig) + mock_od_config.tf_model_config = mock_tf_config + mock_od_config.parallel_config = parallel_config + + model = GlmImageTransformer2DModel( + od_config=mock_od_config, + quant_config=None, + ) + + # Check that quantization config is None + for block in model.transformer_blocks: + assert block.norm1.linear.quant_config is None + assert block.attn1.to_qkv.quant_config is None + + def test_norm_out_has_no_quantization(self, mocker: MockerFixture): + """Verify norm_out (output layer) does NOT use quantization to preserve precision.""" + from vllm_omni.diffusion.data import OmniDiffusionConfig + + mock_quant_config = mocker.MagicMock() + parallel_config = DiffusionParallelConfig( + tensor_parallel_size=1, + sequence_parallel_size=1, + ) + + mock_tf_config = mocker.MagicMock() + mock_tf_config.patch_size = 2 + mock_tf_config.in_channels = 16 + mock_tf_config.out_channels = 16 + mock_tf_config.num_attention_heads = 64 + mock_tf_config.attention_head_dim = 40 + mock_tf_config.time_embed_dim = 512 + mock_tf_config.condition_dim = 256 + mock_tf_config.prior_vq_quantizer_codebook_size = 16384 + mock_tf_config.text_embed_dim = 1024 + mock_tf_config.num_layers = 2 + + mock_od_config = mocker.MagicMock(spec=OmniDiffusionConfig) + mock_od_config.tf_model_config = mock_tf_config + mock_od_config.parallel_config = parallel_config + + model = GlmImageTransformer2DModel( + od_config=mock_od_config, + quant_config=mock_quant_config, + ) + + # norm_out.linear should NOT have quant_config to preserve output precision + assert model.norm_out.linear.quant_config is None + + def test_model_has_sp_plan(self, mocker: MockerFixture): + """Verify model has _sp_plan defined for sequence parallelism.""" + from vllm_omni.diffusion.data import OmniDiffusionConfig + + parallel_config = DiffusionParallelConfig( + tensor_parallel_size=1, + sequence_parallel_size=1, + ) + + mock_tf_config = mocker.MagicMock() + mock_tf_config.patch_size = 2 + mock_tf_config.in_channels = 16 + mock_tf_config.out_channels = 16 + mock_tf_config.num_attention_heads = 64 + mock_tf_config.attention_head_dim = 40 + mock_tf_config.time_embed_dim = 512 + mock_tf_config.condition_dim = 256 + mock_tf_config.prior_vq_quantizer_codebook_size = 16384 + mock_tf_config.text_embed_dim = 1024 + mock_tf_config.num_layers = 2 + + mock_od_config = mocker.MagicMock(spec=OmniDiffusionConfig) + mock_od_config.tf_model_config = mock_tf_config + mock_od_config.parallel_config = parallel_config + + model = GlmImageTransformer2DModel( + od_config=mock_od_config, + quant_config=None, + ) + + # Verify _sp_plan exists + assert hasattr(model, "_sp_plan") + assert "prepare" in model._sp_plan + assert "proj_out" in model._sp_plan + + def test_model_has_hsdp_shard_conditions(self, mocker: MockerFixture): + """Verify model has _hsdp_shard_conditions for HSDP parallelism.""" + from vllm_omni.diffusion.data import OmniDiffusionConfig + + parallel_config = DiffusionParallelConfig( + tensor_parallel_size=1, + sequence_parallel_size=1, + ) + + mock_tf_config = mocker.MagicMock() + mock_tf_config.patch_size = 2 + mock_tf_config.in_channels = 16 + mock_tf_config.out_channels = 16 + mock_tf_config.num_attention_heads = 64 + mock_tf_config.attention_head_dim = 40 + mock_tf_config.time_embed_dim = 512 + mock_tf_config.condition_dim = 256 + mock_tf_config.prior_vq_quantizer_codebook_size = 16384 + mock_tf_config.text_embed_dim = 1024 + mock_tf_config.num_layers = 2 + + mock_od_config = mocker.MagicMock(spec=OmniDiffusionConfig) + mock_od_config.tf_model_config = mock_tf_config + mock_od_config.parallel_config = parallel_config + + model = GlmImageTransformer2DModel( + od_config=mock_od_config, + quant_config=None, + ) + + assert hasattr(model, "_hsdp_shard_conditions") + assert len(model._hsdp_shard_conditions) > 0 + + def test_model_creates_kv_cache(self, mocker: MockerFixture): + """Verify model can create KV cache for image editing.""" + from vllm_omni.diffusion.data import OmniDiffusionConfig + + parallel_config = DiffusionParallelConfig( + tensor_parallel_size=1, + sequence_parallel_size=1, + ) + + mock_tf_config = mocker.MagicMock() + mock_tf_config.patch_size = 2 + mock_tf_config.in_channels = 16 + mock_tf_config.out_channels = 16 + mock_tf_config.num_attention_heads = 64 + mock_tf_config.attention_head_dim = 40 + mock_tf_config.time_embed_dim = 512 + mock_tf_config.condition_dim = 256 + mock_tf_config.prior_vq_quantizer_codebook_size = 16384 + mock_tf_config.text_embed_dim = 1024 + mock_tf_config.num_layers = 2 + + mock_od_config = mocker.MagicMock(spec=OmniDiffusionConfig) + mock_od_config.tf_model_config = mock_tf_config + mock_od_config.parallel_config = parallel_config + + model = GlmImageTransformer2DModel( + od_config=mock_od_config, + quant_config=None, + ) + + kv_cache = model.create_kv_cache() + assert kv_cache is not None + assert len(kv_cache) == 2 # num_layers diff --git a/tests/e2e/offline_inference/test_glm_image_autoround_w4a16_expansion.py b/tests/e2e/offline_inference/test_glm_image_autoround_w4a16_expansion.py new file mode 100644 index 00000000000..2ea4dc64154 --- /dev/null +++ b/tests/e2e/offline_inference/test_glm_image_autoround_w4a16_expansion.py @@ -0,0 +1,238 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""E2E tests for GLM-Image AutoRound W4A16 quantized inference. + +These tests cover text-to-image and image-to-image generation with +the W4A16 quantized GLM-Image model. + +Requirements: + - 2 CUDA GPUs (H100 or equivalent) + - The quantized model checkpoint (Intel/GLM-Image-int4-AutoRound) +""" + +import gc +import math +import os + +import numpy as np +import pytest +from PIL import Image +from vllm import SamplingParams + +from tests.helpers.env import DeviceMemoryMonitor +from tests.helpers.mark import hardware_test +from tests.helpers.media import generate_synthetic_image +from tests.helpers.runtime import OmniRunnerHandler +from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.outputs import OmniRequestOutput +from vllm_omni.platforms import current_omni_platform + +QUANTIZED_MODEL = os.environ.get("GLM_IMAGE_AUTOROUND_MODEL", "Intel/GLM-Image-int4-AutoRound") + +# Small resolution to keep GPU memory & time manageable +HEIGHT = 256 +WIDTH = 256 +NUM_STEPS = 2 # minimal for smoke-test + +# GLM-Image AR generation config (from generation_config.json) +GLM_IMAGE_EOS_TOKEN_ID = 16385 +GLM_IMAGE_VISION_VOCAB_SIZE = 16512 + +_CI_DEPLOY = get_deploy_config_path("glm_image.yaml") + + +def _get_stage_config(): + """Build a CI-friendly stage config with eager mode for testing.""" + return modify_stage_config( + _CI_DEPLOY, + updates={ + "stages": { + 0: {"enforce_eager": True}, + 1: {"enforce_eager": True}, + }, + }, + ) + + +stage_config = _get_stage_config() + +# (model, stage_config_path) for ``omni_runner`` indirect parametrize +_OMNI_RUNNER_PARAM = (QUANTIZED_MODEL, stage_config) + + +def compute_max_tokens(height: int, width: int, factor: int = 32) -> int: + """Compute max_new_tokens for GLM-Image AR text-to-image generation.""" + token_h = height // factor + token_w = width // factor + large_tokens = token_h * token_w + + ratio = token_h / token_w if token_w > 0 else 1.0 + small_token_h = max(1, int(math.sqrt(ratio) * (factor // 2))) + small_token_w = max(1, int(math.sqrt(1 / ratio) * (factor // 2))) + small_tokens = small_token_h * small_token_w + + return small_tokens + large_tokens + 1 + + +def _ar_sampling_params(max_tokens: int, height: int, width: int, seed: int = 42) -> SamplingParams: + """Build AR stage SamplingParams for GLM-Image.""" + return SamplingParams( + temperature=0.9, + top_p=0.75, + top_k=GLM_IMAGE_VISION_VOCAB_SIZE, + max_tokens=max_tokens, + stop_token_ids=[GLM_IMAGE_EOS_TOKEN_ID], + seed=seed, + detokenize=False, + extra_args={ + "target_h": height, + "target_w": width, + }, + ) + + +def _diffusion_sampling_params( + height: int = HEIGHT, + width: int = WIDTH, + num_steps: int = NUM_STEPS, + seed: int = 42, +) -> OmniDiffusionSamplingParams: + """Build Diffusion stage OmniDiffusionSamplingParams.""" + return OmniDiffusionSamplingParams( + height=height, + width=width, + num_inference_steps=num_steps, + guidance_scale=0.0, + seed=seed, + ) + + +pytestmark = [ + pytest.mark.full_model, + pytest.mark.diffusion, +] + + +# ------------------------------------------------------------------ +# Test: text-to-image generation produces a valid image (quantized) +# ------------------------------------------------------------------ + + +@pytest.mark.parametrize("omni_runner", [_OMNI_RUNNER_PARAM], indirect=True) +@hardware_test(res={"cuda": "H100"}, num_cards=2) +def test_glm_image_autoround_w4a16_generates_image(omni_runner_handler: OmniRunnerHandler): + """Load the W4A16 quantized GLM-Image model and verify it produces a valid image.""" + gc.collect() + current_omni_platform.empty_cache() + device_index = current_omni_platform.current_device() + current_omni_platform.reset_peak_memory_stats() + monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) + monitor.start() + + prompt_dict = { + "prompt": "A photo of a cat sitting on a laptop keyboard", + "height": HEIGHT, + "width": WIDTH, + "mm_processor_kwargs": { + "target_h": HEIGHT, + "target_w": WIDTH, + }, + } + ar_params = _ar_sampling_params( + max_tokens=compute_max_tokens(HEIGHT, WIDTH), + height=HEIGHT, + width=WIDTH, + seed=42, + ) + diffusion_params = _diffusion_sampling_params( + height=HEIGHT, + width=WIDTH, + num_steps=NUM_STEPS, + seed=42, + ) + + outputs = omni_runner_handler.runner.generate( + [prompt_dict], + [ar_params, diffusion_params], + ) + + monitor.stop() + + first_output = outputs[0] + assert first_output.final_output_type == "image" + req_out = first_output.request_output + assert isinstance(req_out, OmniRequestOutput) and hasattr(req_out, "images") + images = req_out.images + + assert len(images) >= 1, "Expected at least one generated image" + img = images[0] + assert isinstance(img, Image.Image) + assert img.width == WIDTH, f"Expected width {WIDTH}, got {img.width}" + assert img.height == HEIGHT, f"Expected height {HEIGHT}, got {img.height}" + + # Sanity: image should not be blank (all-zero) + arr = np.array(img) + assert arr.std() > 1.0, "Generated image appears blank (std ≈ 0)" + + gc.collect() + current_omni_platform.empty_cache() + + +# ------------------------------------------------------------------ +# Test: image-to-image generation (quantized) +# ------------------------------------------------------------------ + + +@pytest.mark.parametrize("omni_runner", [_OMNI_RUNNER_PARAM], indirect=True) +@hardware_test(res={"cuda": "H100"}, num_cards=2) +def test_glm_image_autoround_w4a16_image_to_image(omni_runner_handler: OmniRunnerHandler): + """Load the W4A16 quantized GLM-Image and verify image-to-image generation works.""" + ref_image_arr = generate_synthetic_image(WIDTH, HEIGHT)["np_array"] + + gc.collect() + current_omni_platform.empty_cache() + current_omni_platform.reset_peak_memory_stats() + + prompt_dict = { + "prompt": "Make it look like winter", + "multi_modal_data": {"image": ref_image_arr}, + "height": HEIGHT, + "width": WIDTH, + "mm_processor_kwargs": { + "target_h": HEIGHT, + "target_w": WIDTH, + }, + } + ar_params = _ar_sampling_params( + max_tokens=compute_max_tokens(HEIGHT, WIDTH), + height=HEIGHT, + width=WIDTH, + seed=42, + ) + diffusion_params = _diffusion_sampling_params( + height=HEIGHT, + width=WIDTH, + num_steps=NUM_STEPS, + seed=42, + ) + + outputs = omni_runner_handler.runner.generate( + [prompt_dict], + [ar_params, diffusion_params], + ) + + first_output = outputs[0] + assert first_output.final_output_type == "image" + req_out = first_output.request_output + assert isinstance(req_out, OmniRequestOutput) and hasattr(req_out, "images") + images = req_out.images + + assert len(images) >= 1, "Expected at least one generated image" + img = images[0] + assert isinstance(img, Image.Image) + assert img.width == WIDTH + assert img.height == HEIGHT + + gc.collect() + current_omni_platform.empty_cache() diff --git a/tests/model_executor/models/glm_image/test_glm_image_init.py b/tests/model_executor/models/glm_image/test_glm_image_init.py new file mode 100644 index 00000000000..eb42e9649fe --- /dev/null +++ b/tests/model_executor/models/glm_image/test_glm_image_init.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for GLM-Image model __init__.py lazy import pattern. + +The ``__init__.py`` uses ``__getattr__`` for lazy loading to avoid importing +``transformers.models.glm_image`` at module init, which may not be available +in all environments. +""" + +import pytest + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +class TestLazyImport: + """Test the __getattr__ lazy import pattern in glm_image __init__.py.""" + + def test_getattr_exists_and_is_callable(self): + """Verify __getattr__ exists and is callable.""" + import vllm_omni.model_executor.models.glm_image as glm_image_pkg + + assert hasattr(glm_image_pkg, "__getattr__") + assert callable(glm_image_pkg.__getattr__) + + def test_getattr_returns_class_for_known_attribute(self): + """Verify __getattr__ returns GlmImageForConditionalGeneration for known attribute.""" + import vllm_omni.model_executor.models.glm_image as glm_image_pkg + + # Call __getattr__ directly to test the lazy import logic + result = glm_image_pkg.__getattr__("GlmImageForConditionalGeneration") + + # Verify we get a class (the actual GlmImageForConditionalGeneration) + assert result is not None + assert isinstance(result, type) + assert result.__name__ == "GlmImageForConditionalGeneration" + + def test_getattr_raises_for_unknown_attribute(self): + """Verify __getattr__ raises AttributeError for unknown attributes.""" + import vllm_omni.model_executor.models.glm_image as glm_image_pkg + + # Test unknown attribute via __getattr__ directly + with pytest.raises(AttributeError, match="has no attribute"): + glm_image_pkg.__getattr__("UnknownClass") + + def test___all___exports_correct_symbols(self): + """Verify __all__ contains the expected exported symbols.""" + import vllm_omni.model_executor.models.glm_image as glm_image_pkg + + assert hasattr(glm_image_pkg, "__all__") + assert "GlmImageForConditionalGeneration" in glm_image_pkg.__all__ + + +class TestLazyImportDoesNotImportTransformersAtInit: + """Verify that importing the package does not eagerly load transformers.""" + + def test_glm_image_module_has_getattr(self): + """Test that the module has __getattr__ for lazy loading.""" + import vllm_omni.model_executor.models.glm_image as glm_image_pkg + + # The module should have __getattr__ + assert hasattr(glm_image_pkg, "__getattr__") + assert callable(glm_image_pkg.__getattr__) + + # And __all__ + assert hasattr(glm_image_pkg, "__all__") + + def test_module_has_proper_structure(self): + """Test that the module has proper Python module structure.""" + import vllm_omni.model_executor.models.glm_image as glm_image_pkg + + # Should be a module + assert hasattr(glm_image_pkg, "__name__") + assert glm_image_pkg.__name__ == "vllm_omni.model_executor.models.glm_image" + + # Should have __file__ (even if it's a package) + assert hasattr(glm_image_pkg, "__file__") diff --git a/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py b/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py index 37ea454de20..152286ec220 100644 --- a/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py +++ b/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py @@ -77,7 +77,7 @@ def load_pipeline(cls, model_path: str, device: str, dtype: torch.dtype) -> Any: os.path.join("transformer", "config.json"), od_config.model, ) - od_config.tf_model_config = TransformerConfig.from_dict(tf_config_dict) + od_config.set_tf_model_config(TransformerConfig.from_dict(tf_config_dict)) loader = DiffusersPipelineLoader(LoadConfig(), od_config=od_config) # load_model will handle dtypes / device placement, put in .eval() mode diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py index ddb32aa2025..56a7066e390 100644 --- a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py +++ b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py @@ -15,6 +15,7 @@ from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, + ReplicatedLinear, RowParallelLinear, ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -33,6 +34,9 @@ ) from vllm_omni.diffusion.forward_context import get_forward_context +if TYPE_CHECKING: + from vllm.model_executor.layers.quantization.base_config import QuantizationConfig + logger = init_logger(__name__) @@ -230,11 +234,24 @@ def forward( class GlmImageAdaLayerNormZero(nn.Module): """Adaptive LayerNorm with zero initialization for both image and text streams.""" - def __init__(self, embedding_dim: int, dim: int) -> None: + def __init__( + self, + embedding_dim: int, + dim: int, + quant_config: "QuantizationConfig | None" = None, + prefix: str = "", + ) -> None: super().__init__() self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-5) self.norm_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-5) - self.linear = nn.Linear(embedding_dim, 12 * dim, bias=True) + self.linear = ReplicatedLinear( + embedding_dim, + 12 * dim, + bias=True, + return_bias=False, + quant_config=quant_config, + prefix=f"{prefix}.linear", + ) def forward( self, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor, temb: torch.Tensor @@ -244,6 +261,8 @@ def forward( norm_encoder_hidden_states = self.norm_context(encoder_hidden_states).to(dtype=dtype) emb = self.linear(temb) + if isinstance(emb, tuple): + emb = emb[0] ( shift_msa, c_shift_msa, @@ -286,14 +305,25 @@ def __init__( elementwise_affine: bool = True, eps: float = 1e-5, bias: bool = True, + quant_config: "QuantizationConfig | None" = None, + prefix: str = "", ): super().__init__() - self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias) + self.linear = ReplicatedLinear( + conditioning_embedding_dim, + embedding_dim * 2, + bias=bias, + return_bias=False, + quant_config=quant_config, + prefix=f"{prefix}.linear", + ) self.norm = nn.LayerNorm(embedding_dim, eps=eps, elementwise_affine=elementwise_affine) def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor: # NO SiLU here emb = self.linear(conditioning_embedding.to(x.dtype)) + if isinstance(emb, tuple): + emb = emb[0] scale, shift = torch.chunk(emb, 2, dim=1) x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :] return x @@ -469,6 +499,7 @@ def __init__( out_bias: bool = True, eps: float = 1e-5, quant_config: "QuantizationConfig | None" = None, + prefix: str = "", ): super().__init__() self.dim = dim @@ -485,6 +516,7 @@ def __init__( bias=True, return_bias=False, quant_config=quant_config, + prefix=f"{prefix}.to_qkv", ) # QK normalization (LayerNorm, not RMSNorm for GLM-Image) @@ -501,6 +533,7 @@ def __init__( input_is_parallel=True, return_bias=False, quant_config=quant_config, + prefix=f"{prefix}.to_out.0", ), nn.Dropout(0.0), ] @@ -554,7 +587,7 @@ def forward( hidden_states_combined = torch.cat([encoder_hidden_states, hidden_states], dim=1) # QKV projection - qkv = self.to_qkv(hidden_states_combined) + qkv = self.to_qkv(hidden_states_combined.contiguous()) q_size = self.to_qkv.num_heads * self.head_dim kv_size = self.to_qkv.num_kv_heads * self.head_dim query, key, value = qkv.split([q_size, kv_size, kv_size], dim=-1) @@ -604,8 +637,12 @@ def forward( # Project combined [text, image] outputs, then split. # This keeps SP numerically aligned with the non-SP path. joint_hidden_states_out = joint_hidden_states_out.flatten(2, 3).to(dtype) + # Contiguous for FP8/W4A16 quantized RowParallelLinear for module in self.to_out: - joint_hidden_states_out = module(joint_hidden_states_out) + if isinstance(module, RowParallelLinear): + joint_hidden_states_out = module(joint_hidden_states_out.contiguous()) + else: + joint_hidden_states_out = module(joint_hidden_states_out) encoder_hidden_states_out = joint_hidden_states_out[:, :text_seq_length, :] hidden_states_out = joint_hidden_states_out[:, text_seq_length:, :] @@ -645,7 +682,10 @@ def forward( # Output projection for module in self.to_out: - hidden_states_out = module(hidden_states_out) + if isinstance(module, RowParallelLinear): + hidden_states_out = module(hidden_states_out.contiguous()) + else: + hidden_states_out = module(hidden_states_out) # Split back to text and image encoder_hidden_states_out = hidden_states_out[:, :text_seq_length, :] @@ -663,6 +703,7 @@ def __init__( approximate: str = "none", bias: bool = True, quant_config: "QuantizationConfig | None" = None, + prefix: str = "", ): super().__init__() self.proj = ColumnParallelLinear( @@ -672,6 +713,7 @@ def __init__( gather_output=False, return_bias=False, quant_config=quant_config, + prefix=f"{prefix}.proj", ) self.approximate = approximate @@ -688,6 +730,7 @@ def __init__( *, bias: bool = True, quant_config: "QuantizationConfig | None" = None, + prefix: str = "", ): super().__init__() self.proj = ColumnParallelLinear( @@ -697,6 +740,7 @@ def __init__( gather_output=False, return_bias=False, quant_config=quant_config, + prefix=f"{prefix}.proj", ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -714,6 +758,7 @@ def __init__( bias: bool = True, activation_fn: str = "gelu", quant_config: "QuantizationConfig | None" = None, + prefix: str = "", ): super().__init__() inner_dim = inner_dim or int(dim * mult) @@ -721,7 +766,7 @@ def __init__( if activation_fn == "linear-silu": layers: list[nn.Module] = [ - ColumnParallelSiLU(dim, inner_dim, bias=bias, quant_config=quant_config), + ColumnParallelSiLU(dim, inner_dim, bias=bias, quant_config=quant_config, prefix=f"{prefix}.net.0"), nn.Identity(), RowParallelLinear( inner_dim, @@ -730,12 +775,20 @@ def __init__( input_is_parallel=True, return_bias=False, quant_config=quant_config, + prefix=f"{prefix}.net.2", ), ] else: approximate = "tanh" if activation_fn == "gelu-approximate" else "none" layers = [ - ColumnParallelGELU(dim, inner_dim, approximate=approximate, bias=bias, quant_config=quant_config), + ColumnParallelGELU( + dim, + inner_dim, + approximate=approximate, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.net.0", + ), nn.Identity(), RowParallelLinear( inner_dim, @@ -744,6 +797,7 @@ def __init__( input_is_parallel=True, return_bias=False, quant_config=quant_config, + prefix=f"{prefix}.net.2", ), ] @@ -751,7 +805,12 @@ def __init__( def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: for module in self.net: - hidden_states = module(hidden_states) + if isinstance(module, ColumnParallelLinear): + hidden_states, _ = module(hidden_states) + elif isinstance(module, RowParallelLinear): + hidden_states = module(hidden_states.contiguous()) + else: + hidden_states = module(hidden_states) return hidden_states @@ -765,19 +824,21 @@ def __init__( attention_head_dim: int = 40, time_embed_dim: int = 512, ffn_hidden_dim: int | None = None, - quant_config: "QuantizationConfig | None" = None, parallel_config: DiffusionParallelConfig | None = None, + quant_config: "QuantizationConfig | None" = None, + prefix: str = "", ) -> None: super().__init__() # 1. Attention with AdaLN - self.norm1 = GlmImageAdaLayerNormZero(time_embed_dim, dim) + self.norm1 = GlmImageAdaLayerNormZero(time_embed_dim, dim, quant_config=quant_config, prefix=f"{prefix}.norm1") self.attn1 = GlmImageAttention( dim=dim, num_heads=num_attention_heads, head_dim=attention_head_dim, - quant_config=quant_config, parallel_config=parallel_config, + quant_config=quant_config, + prefix=f"{prefix}.attn1", ) # 2. Feedforward @@ -789,6 +850,7 @@ def __init__( inner_dim=ffn_hidden_dim, activation_fn="gelu-approximate", quant_config=quant_config, + prefix=f"{prefix}.ff", ) def forward( @@ -959,6 +1021,7 @@ def __init__( inner_dim=inner_dim, activation_fn="gelu", quant_config=quant_config, + prefix="glyph_projector", ) self.prior_token_embedding = nn.Embedding(prior_vq_quantizer_codebook_size, inner_dim) self.prior_projector = GlmImageFeedForward( @@ -967,6 +1030,7 @@ def __init__( inner_dim=inner_dim, activation_fn="linear-silu", quant_config=quant_config, + prefix="prior_projector", ) # Prepare module for SP (encapsulates patch embedding and RoPE for _sp_plan) @@ -988,15 +1052,20 @@ def __init__( attention_head_dim, time_embed_dim, ffn_hidden_dim=ffn_hidden_dim, - quant_config=quant_config, parallel_config=self.parallel_config, + quant_config=quant_config, + prefix=f"transformer_blocks.{i}", ) - for _ in range(num_layers) + for i in range(num_layers) ] ) # 4. Output projection - self.norm_out = GlmImageAdaLayerNormContinuous(inner_dim, time_embed_dim, elementwise_affine=False) + # Final modulation feeds proj_out; quant_config is NOT applied here + # to avoid precision degradation in the final projection layer. + self.norm_out = GlmImageAdaLayerNormContinuous( + inner_dim, time_embed_dim, elementwise_affine=False, quant_config=None, prefix="norm_out" + ) self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels, bias=True) def forward( diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py index 97cba18c234..f5b7d608abf 100644 --- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py +++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py @@ -301,10 +301,7 @@ def __init__( # Load transformer (DiT) logger.info("Loading GlmImageTransformer2DModel (DiT)...") - self.transformer = GlmImageTransformer2DModel( - od_config=od_config, - quant_config=od_config.quantization_config, - ) + self.transformer = GlmImageTransformer2DModel(od_config=od_config, quant_config=od_config.quantization_config) # Weight sources for DiT loading self.weights_sources = [