Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
b63d430
try fix with mm_processor_kwargs
JaredforReal Mar 27, 2026
0ca4ba2
fix small h/w calculate
JaredforReal Mar 27, 2026
aba40ff
fix parse_mm_data
JaredforReal Mar 27, 2026
4ef4240
back up
JaredforReal Mar 27, 2026
23445c1
fix in images/ endpoints
JaredforReal Mar 27, 2026
96c82da
update max_tokens
JaredforReal Mar 30, 2026
5f11b57
log
JaredforReal Mar 31, 2026
3ea8392
log
JaredforReal Mar 31, 2026
5a2110f
trace back
JaredforReal Mar 31, 2026
bbec1e1
fix in serving_chat
JaredforReal Mar 31, 2026
ae356cd
try in apply_main
JaredforReal Mar 31, 2026
64ead74
fix para
JaredforReal Mar 31, 2026
3de0259
rewrite text only
JaredforReal Mar 31, 2026
c9b46e3
revert serving chat
JaredforReal Mar 31, 2026
bc8d738
serving chat
JaredforReal Mar 31, 2026
8bfbe5a
fix max_tokens
JaredforReal Mar 31, 2026
3841145
debug
JaredforReal Mar 31, 2026
392f070
fix wrong default max_tokens
JaredforReal Mar 31, 2026
5c990db
return t2i early
JaredforReal Mar 31, 2026
5f92bb0
log
JaredforReal Mar 31, 2026
54cc3e0
calc image_grid_thw manually
JaredforReal Apr 1, 2026
73105ee
robust i2i mrope positions
JaredforReal Apr 2, 2026
36bdfa0
fix t2i offline sp
JaredforReal Apr 2, 2026
cb93180
i2i max tokens compute
JaredforReal Apr 2, 2026
6f3ccdf
i2i detect
JaredforReal Apr 2, 2026
d2da3c0
i2i processor
JaredforReal Apr 2, 2026
a2db045
serving chat i2i
JaredforReal Apr 2, 2026
abdbfc3
simplify end2end and serving_chat
JaredforReal Apr 2, 2026
ce5e3ef
simplify glm_image stage input processor
JaredforReal Apr 2, 2026
c08c550
simplify glm_image stage input processor
JaredforReal Apr 2, 2026
774da2b
Merge branch 'main' into fix/glm
JaredforReal Apr 2, 2026
3c7f8b5
pre-commit serving chat
JaredforReal Apr 2, 2026
9a5184f
refactor i2i processor
JaredforReal Apr 2, 2026
d4ffa1d
fix up for i2i online serving
JaredforReal Apr 2, 2026
4ce632c
accept some reviews
JaredforReal Apr 7, 2026
8991bf4
Merge branch 'main' into fix/glm
JaredforReal Apr 7, 2026
993d93d
accept more review
JaredforReal Apr 7, 2026
c42d33e
Merge branch 'main' into fix/glm
JaredforReal Apr 8, 2026
530f4f0
add unit tests for glm image processor, stage input processor, servin…
JaredforReal Apr 8, 2026
3ad7fc5
rename test file
JaredforReal Apr 8, 2026
83aab2c
update tests
JaredforReal Apr 8, 2026
f5c8c3d
update tests
JaredforReal Apr 8, 2026
9f713a2
Merge branch 'main' into fix/glm
JaredforReal Apr 10, 2026
9cecbba
Merge branch 'main' into fix/glm
hsliuustc0106 Apr 10, 2026
5a54b5c
Merge branch 'main' into fix/glm
hsliuustc0106 Apr 11, 2026
b176c57
update tests
JaredforReal Apr 13, 2026
e972341
get rid of transformers
JaredforReal Apr 13, 2026
f5865bd
fix unexpected consequence
JaredforReal Apr 16, 2026
1191d3a
add mm_uuids to _process_text
JaredforReal Apr 16, 2026
25704e7
add mm_uuids
JaredforReal Apr 16, 2026
d4c9360
Merge branch 'main' into fix/glm
JaredforReal Apr 16, 2026
5008cda
add more mm_uuids
JaredforReal Apr 16, 2026
9b81cb6
profile ar2dit
JaredforReal Apr 20, 2026
9ba8ec1
Merge branch 'main' into fix/glm
JaredforReal Apr 20, 2026
200983f
fix simple unit test
JaredforReal Apr 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 26 additions & 12 deletions examples/offline_inference/glm_image/end2end.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,34 +57,38 @@
GLM_IMAGE_VISION_VOCAB_SIZE = 16512 # top_k should be vision_vocab_size


def compute_max_tokens(height: int, width: int, factor: int = 32, is_i2i: bool = False) -> int:
    """
    Compute max_new_tokens for GLM-Image AR generation.

    GLM-Image generation differs by mode:
    - text-to-image (t2i): small preview + large target + EOS
    - image-to-image (i2i): large target + EOS

    The large grid is ``(height // factor) x (width // factor)`` tokens. The
    small preview grid is derived from the aspect ratio of the large grid:
    each side is ``sqrt(ratio)`` (or its inverse) times ``factor // 2``,
    truncated to an int and clamped to at least 1.

    Args:
        height: Target image height in pixels
        width: Target image width in pixels
        factor: Downsampling factor (32 for GLM-Image AR output)
        is_i2i: Whether the request is image-to-image mode

    Returns:
        Total number of tokens to generate for the specified mode
    """
    import math

    # Large image tokens (target resolution)
    token_h = height // factor
    token_w = width // factor
    large_tokens = token_h * token_w

    # i2i emits no preview image, so the small grid is never needed.
    if is_i2i:
        return large_tokens + 1

    # A degenerate grid (either side rounds down to 0 tokens) has no usable
    # aspect ratio. Fall back to a square preview instead of dividing by zero:
    # token_w == 0 would break token_h / token_w, and token_h == 0 would break
    # the 1 / ratio below.
    ratio = token_h / token_w if token_h > 0 and token_w > 0 else 1.0

    # Small preview tokens, shaped to follow the target aspect ratio.
    preview_side = factor // 2
    small_token_h = max(1, int(math.sqrt(ratio) * preview_side))
    small_token_w = max(1, int(math.sqrt(1 / ratio) * preview_side))
    small_tokens = small_token_h * small_token_w

    # Total: small + large + EOS
    return small_tokens + large_tokens + 1


Expand Down Expand Up @@ -282,14 +286,18 @@ def main(args: argparse.Namespace) -> None:
# Compute max_tokens dynamically based on target image size
target_height = prompt_dict.get("height", 1024)
target_width = prompt_dict.get("width", 1024)
calculated_max_tokens = compute_max_tokens(target_height, target_width)
is_i2i = source_image is not None
calculated_max_tokens = compute_max_tokens(target_height, target_width, is_i2i=is_i2i)

# Use calculated value unless user explicitly specified a different value
# Default args.max_tokens is 16384 (very large), so prefer calculated value
effective_max_tokens = calculated_max_tokens if args.max_tokens == 16384 else args.max_tokens

if args.verbose:
print(f"AR max_tokens: {effective_max_tokens} (calculated: {calculated_max_tokens}, arg: {args.max_tokens})")
print(
f"AR max_tokens: {effective_max_tokens} "
f"(calculated: {calculated_max_tokens}, arg: {args.max_tokens}, mode: {'i2i' if is_i2i else 't2i'})"
)

# IMPORTANT: GLM-Image AR model requires these exact sampling parameters
# from generation_config.json for proper image token generation.
Expand All @@ -303,6 +311,12 @@ def main(args: argparse.Namespace) -> None:
stop_token_ids=[GLM_IMAGE_EOS_TOKEN_ID], # 16385, CRITICAL for stopping
seed=args.seed,
detokenize=False,
# Keep target size available in runner/model for deterministic M-RoPE
# decode grids in t2i (no mm_features available in this path).
extra_args={
"target_h": int(target_height),
"target_w": int(target_width),
},
)

# For diffusion stage, sampling_params contains diffusion-specific parameters
Expand Down
186 changes: 186 additions & 0 deletions tests/entrypoints/openai_api/test_serving_chat_sampling_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,10 @@ def mock_request(mocker: MockerFixture):
request.stop_token_ids = None
request.frequency_penalty = None
request.presence_penalty = None
# Must be real Python objects (not MagicMock) so the code's explicit-field
# and extra_body checks work correctly.
request.model_fields_set = set()
request.extra_body = {}
return request


Expand Down Expand Up @@ -150,6 +154,7 @@ def test_preserves_yaml_defaults_when_no_request_params(serving_chat, mock_reque
def test_request_temperature_overrides_yaml_default(serving_chat, mock_request):
"""Test that request temperature overrides YAML default."""
mock_request.temperature = 0.8
mock_request.model_fields_set = {"temperature"}

result = serving_chat._build_sampling_params_list_from_request(mock_request)

Expand All @@ -162,6 +167,7 @@ def test_request_temperature_overrides_yaml_default(serving_chat, mock_request):
def test_request_top_p_overrides_yaml_default(serving_chat, mock_request):
"""Test that request top_p overrides YAML default."""
mock_request.top_p = 0.95
mock_request.model_fields_set = {"top_p"}

result = serving_chat._build_sampling_params_list_from_request(mock_request)

Expand All @@ -173,6 +179,7 @@ def test_request_top_p_overrides_yaml_default(serving_chat, mock_request):
def test_request_max_tokens_overrides_yaml_default(serving_chat, mock_request):
"""Test that request max_tokens overrides YAML default."""
mock_request.max_tokens = 100
mock_request.model_fields_set = {"max_tokens"}

result = serving_chat._build_sampling_params_list_from_request(mock_request)

Expand All @@ -189,6 +196,7 @@ def test_max_tokens_uses_yaml_default_when_not_specified(serving_chat, mock_requ
def test_request_seed_overrides_yaml_default(serving_chat, mock_request):
"""Test that request seed overrides YAML default."""
mock_request.seed = 123
mock_request.model_fields_set = {"seed"}

result = serving_chat._build_sampling_params_list_from_request(mock_request)

Expand All @@ -200,6 +208,7 @@ def test_request_seed_overrides_yaml_default(serving_chat, mock_request):
def test_request_frequency_penalty_overrides(serving_chat, mock_request):
"""Test that request frequency_penalty is applied."""
mock_request.frequency_penalty = 0.5
mock_request.model_fields_set = {"frequency_penalty"}

result = serving_chat._build_sampling_params_list_from_request(mock_request)

Expand All @@ -209,6 +218,7 @@ def test_request_frequency_penalty_overrides(serving_chat, mock_request):
def test_request_presence_penalty_overrides(serving_chat, mock_request):
"""Test that request presence_penalty is applied."""
mock_request.presence_penalty = 0.3
mock_request.model_fields_set = {"presence_penalty"}

result = serving_chat._build_sampling_params_list_from_request(mock_request)

Expand All @@ -235,6 +245,7 @@ def test_multiple_params_override_together(serving_chat, mock_request):
mock_request.temperature = 0.7
mock_request.top_p = 0.85
mock_request.seed = 999
mock_request.model_fields_set = {"max_tokens", "temperature", "top_p", "seed"}

result = serving_chat._build_sampling_params_list_from_request(mock_request)

Expand Down Expand Up @@ -275,6 +286,7 @@ def test_apply_request_overrides_applies_values(serving_chat, mock_request, defa
"""Test that _apply_request_overrides applies non-None request values."""
mock_request.temperature = 0.8
mock_request.seed = 123
mock_request.model_fields_set = {"temperature", "seed"}

result = serving_chat._apply_request_overrides(default_comprehension_params, mock_request)

Expand Down Expand Up @@ -304,6 +316,8 @@ def test_apply_overrides_empty_stop_list_preserves_default(serving_chat, mocker)
request.stop_token_ids = None
request.frequency_penalty = None
request.presence_penalty = None
request.model_fields_set = {"stop"}
request.extra_body = {}

result = serving_chat._apply_request_overrides(default_params, request)

Expand All @@ -325,6 +339,8 @@ def test_apply_overrides_nonempty_stop_list_overrides_default(serving_chat, mock
request.stop_token_ids = None
request.frequency_penalty = None
request.presence_penalty = None
request.model_fields_set = {"stop"}
request.extra_body = {}

result = serving_chat._apply_request_overrides(default_params, request)

Expand Down Expand Up @@ -367,6 +383,8 @@ def test_apply_overrides_nonempty_stop_token_ids_overrides_default(serving_chat,
request.stop_token_ids = [100] # non-empty list — should override
request.frequency_penalty = None
request.presence_penalty = None
request.model_fields_set = {"stop_token_ids"}
request.extra_body = {}

result = serving_chat._apply_request_overrides(default_params, request)

Expand All @@ -392,6 +410,8 @@ def test_apply_overrides_mixed_empty_and_nonempty_lists(serving_chat, mocker):
request.stop_token_ids = [100, 200] # non-empty — SHOULD override
request.frequency_penalty = None
request.presence_penalty = None
request.model_fields_set = {"temperature", "stop", "stop_token_ids"}
request.extra_body = {}

result = serving_chat._apply_request_overrides(default_params, request)

Expand All @@ -415,6 +435,8 @@ def test_apply_overrides_none_scalar_still_preserves_default(serving_chat, mocke
request.stop_token_ids = None
request.frequency_penalty = None
request.presence_penalty = None
request.model_fields_set = set()
request.extra_body = {}

result = serving_chat._apply_request_overrides(default_params, request)

Expand Down Expand Up @@ -442,6 +464,8 @@ def test_apply_overrides_both_lists_empty_preserves_defaults(serving_chat, mocke
request.stop_token_ids = []
request.frequency_penalty = None
request.presence_penalty = None
request.model_fields_set = {"stop", "stop_token_ids"}
request.extra_body = {}

result = serving_chat._apply_request_overrides(default_params, request)

Expand Down Expand Up @@ -511,3 +535,165 @@ def test_get_comprehension_stage_index_raises_when_not_found(mocker: MockerFixtu

with pytest.raises(ValueError, match="No comprehension stage"):
instance._get_comprehension_stage_index()


# =============================================================================
# Tests for _resolve_height_width_from_extra_body
# =============================================================================


class TestResolveHeightWidth:
    """Checks for ``OmniOpenAIServingChat._resolve_height_width_from_extra_body``."""

    @staticmethod
    def _resolve(extra_body):
        """Run the resolver on ``extra_body`` and return its ``(height, width)`` pair."""
        # Imported inside the call so the serving module is only loaded when a
        # test in this class actually runs.
        from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat

        return OmniOpenAIServingChat._resolve_height_width_from_extra_body(extra_body)

    def test_explicit_height_width(self):
        height, width = self._resolve({"height": 512, "width": 768})
        assert (height, width) == (512, 768)

    def test_size_string(self):
        # "size" is written as WIDTHxHEIGHT.
        height, width = self._resolve({"size": "768x512"})
        assert (width, height) == (768, 512)

    def test_size_string_uppercase(self):
        # An uppercase "X" separator is accepted as well.
        height, width = self._resolve({"size": "768X512"})
        assert (width, height) == (768, 512)

    def test_size_fallback_when_height_missing(self):
        height, width = self._resolve({"size": "512x512", "width": 1024})
        # height is None -> size fallback fires and sets BOTH width and height
        assert (height, width) == (512, 512)

    def test_empty_extra_body(self):
        height, width = self._resolve({})
        assert height is None and width is None

    def test_invalid_size_format_ignored(self):
        height, width = self._resolve({"size": "invalid"})
        assert height is None and width is None


# =============================================================================
# Tests for _apply_request_overrides with GLM-Image (max_tokens computation)
# =============================================================================


class TestApplyRequestOverridesGLMImage:
    """Test dynamic max_tokens computation for GLM-Image AR stage."""

    # Sampling fields that every mocked request in this class leaves unset.
    _UNSET_FIELDS = (
        "temperature",
        "top_p",
        "top_k",
        "max_tokens",
        "min_tokens",
        "seed",
        "ignore_eos",
        "stop",
        "stop_token_ids",
        "frequency_penalty",
        "presence_penalty",
    )

    def _make_request(self, mocker: MockerFixture, extra_body: dict):
        """Build a mocked chat request with all sampling fields set to None.

        Args:
            mocker: pytest-mock fixture used to create the MagicMock.
            extra_body: Value assigned to ``request.extra_body``.

        Returns:
            A MagicMock whose sampling fields are None, whose ``extra_body`` is
            the given dict and whose ``model_fields_set`` is an empty set.
        """
        req = mocker.MagicMock()
        for field in self._UNSET_FIELDS:
            setattr(req, field, None)
        # Plain dict / set rather than auto-created MagicMock attributes, so the
        # values behave like real containers when the code under test reads them.
        req.extra_body = extra_body
        req.model_fields_set = set()
        return req

    @pytest.fixture
    def glm_serving_chat(self, mock_engine_client, mocker: MockerFixture):
        from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat

        # Bypass __init__; only the attributes assigned below are set up.
        instance = object.__new__(OmniOpenAIServingChat)
        instance.engine_client = mock_engine_client
        # Mock the image extraction to return no reference images (t2i by default)
        instance._extract_diffusion_prompt_and_images_from_messages = mocker.MagicMock(return_value=("a cat", []))
        return instance

    @pytest.fixture
    def glm_request(self, mocker: MockerFixture):
        return self._make_request(mocker, {"height": 1024, "width": 1024})

    def test_t2i_computes_max_tokens(self, glm_serving_chat, glm_request, default_comprehension_params):
        """t2i mode: max_tokens computed from height/width, no reference images."""
        result = glm_serving_chat._apply_request_overrides(default_comprehension_params, glm_request)
        # t2i 1024x1024 = 256 + 1024 + 1 = 1281
        assert result.max_tokens == 1281
        assert result.extra_args["target_h"] == 1024
        assert result.extra_args["target_w"] == 1024

    def test_i2i_computes_fewer_tokens(
        self, glm_serving_chat, glm_request, default_comprehension_params, mocker: MockerFixture
    ):
        """i2i mode: max_tokens should be smaller than t2i for same dimensions."""
        # Make it detect reference images
        glm_serving_chat._extract_diffusion_prompt_and_images_from_messages = mocker.MagicMock(
            return_value=("edit this", ["fake_image"])
        )

        result = glm_serving_chat._apply_request_overrides(default_comprehension_params, glm_request)
        # i2i 1024x1024 = 1024 + 1 = 1025
        assert result.max_tokens == 1025

    def test_dynamic_max_tokens_overrides_user_value(self, glm_serving_chat, glm_request, default_comprehension_params):
        """When height/width are provided, dynamic computation overrides user max_tokens."""
        glm_request.max_tokens = 500
        glm_request.model_fields_set = {"max_tokens"}

        result = glm_serving_chat._apply_request_overrides(default_comprehension_params, glm_request)
        # Dynamic computation from height/width always wins when present
        assert result.max_tokens == 1281

    def test_no_height_width_preserves_default(
        self, glm_serving_chat, mocker: MockerFixture, default_comprehension_params
    ):
        """When no height/width in extra_body, keep YAML default max_tokens."""
        req = self._make_request(mocker, {})

        result = glm_serving_chat._apply_request_overrides(default_comprehension_params, req)
        assert result.max_tokens == 2048  # YAML default

    def test_size_string_parsed_for_glm_image(
        self, glm_serving_chat, mocker: MockerFixture, default_comprehension_params
    ):
        """'size' in extra_body is parsed as fallback for height/width."""
        req = self._make_request(mocker, {"size": "512x512"})

        result = glm_serving_chat._apply_request_overrides(default_comprehension_params, req)
        # 512x512 t2i = 256 + 256 + 1 = 513
        assert result.max_tokens == 513
Loading
Loading