Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
6859629
fix(hyperclovax): support multimodal prompt handling
effortprogrammer Jan 10, 2026
c603df8
fix(hyperclovax): register HyperCLOVAXForCausalLM
effortprogrammer Jan 10, 2026
ac10e8a
fix(hyperclovax): restore V1 HCXVisionForCausalLM class for backward compatibility
effortprogrammer Jan 17, 2026
6fedc9e
feat(hyperclovax): add dedicated V2 vision module
effortprogrammer Feb 19, 2026
88d6efd
refactor(hyperclovax): isolate V1 implementation and reroute V2 registry
effortprogrammer Feb 19, 2026
4a550fd
merge(main): resolve HyperCLOVAX vision split conflicts
effortprogrammer Feb 19, 2026
7fce965
fix(hyperclovax): use multimodal.processing imports for V2 dummy inputs
effortprogrammer Feb 19, 2026
d4ab8eb
fix(hyperclovax): align V2 dummy input builder with multimodal API
effortprogrammer Feb 19, 2026
340d0b5
fix(hyperclovax): drop unsupported multimodal_config arg in V2 vision…
effortprogrammer Feb 19, 2026
729187a
refactor(hyperclovax): address reviewer feedback in model and registry
effortprogrammer Feb 19, 2026
335bc41
docs: add HyperCLOVAX V2 to supported models
effortprogrammer Feb 19, 2026
8b0d8b3
fix(hyperclovax): remove Omni references from V2 scope
effortprogrammer Feb 19, 2026
677e249
fix(hyperclovax): limit V2 text backend handling to hyperclovax
effortprogrammer Feb 19, 2026
8aeb7a6
fix(hyperclovax): restore original text backend condition
effortprogrammer Feb 19, 2026
f1be0ec
Merge branch 'main' into feat/hyperclovax-seed-think-32b
DarkLight1337 Feb 20, 2026
a175d40
Merge branch 'main' into feat/hyperclovax-seed-think-32b
effortprogrammer Mar 8, 2026
f419cda
fix(hyperclovax): align V2 multimodal processor update flow
effortprogrammer Mar 8, 2026
0c25a00
fix(chat): normalize openai media parts to canonical modality types
effortprogrammer Mar 9, 2026
b1036cc
fix(hyperclovax): satisfy registry coverage and dummy prompt type
effortprogrammer Mar 9, 2026
56ac889
fix(registry): map HyperCLOVAX architecture as text model
effortprogrammer Mar 9, 2026
5abbb8c
fix(ci): harden cpu runner tensor replacement and realtime warmup wait
effortprogrammer Mar 9, 2026
dd7227f
Merge origin/main into feat/hyperclovax-seed-think-32b
effortprogrammer Mar 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/models/supported_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -701,6 +701,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `GlmOcrForConditionalGeneration` | GLM-OCR | T + I<sup>E+</sup> | `zai-org/GLM-OCR`, etc. | ✅︎ | ✅︎ |
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ |
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | T + I<sup>+</sup> + V<sup>+</sup> | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | |
| `HCXVisionV2ForCausalLM` | HyperCLOVAX-SEED-Think-32B | T + I<sup>+</sup> + V<sup>+</sup> | `naver-hyperclovax/HyperCLOVAX-SEED-Think-32B` | | |
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
| `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
Expand Down
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_realtime_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ async def test_multi_chunk_streaming(
# JIT compilation
warmup_done = False
while not warmup_done:
event = await receive_event(ws, timeout=360.0)
event = await receive_event(ws, timeout=600.0)
if event["type"] in ("transcription.done", "error"):
warmup_done = True

Expand Down
32 changes: 32 additions & 0 deletions tests/entrypoints/test_chat_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1458,6 +1458,38 @@ def test_parse_chat_messages_context_text_format(
assert mm_uuids is None


def test_parse_chat_messages_openai_format_image_url(
    phi3v_model_config,
    image_url,
):
    """Parts parsed with ``content_format="openai"`` keep the list-of-dicts
    shape: the image_url part collapses to a bare ``{"type": "image"}``
    placeholder while the text part is passed through verbatim."""
    # One user turn mixing an OpenAI-style image_url part with plain text.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": image_url}},
                {"type": "text", "text": "What's in the image?"},
            ],
        }
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        messages,
        phi3v_model_config,
        content_format="openai",
    )

    expected_conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What's in the image?"},
            ],
        }
    ]
    assert conversation == expected_conversation
    # Exactly one image was extracted, with no caller-supplied UUID.
    _assert_mm_data_is_image_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])


def test_parse_chat_messages_rejects_too_many_images_in_one_message(
phi3v_model_config,
image_url,
Expand Down
8 changes: 8 additions & 0 deletions tests/models/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,10 @@ def check_available_online(
"HunYuanMoEV1ForCausalLM": _HfExamplesInfo(
"tencent/Hunyuan-A13B-Instruct", trust_remote_code=True
),
"HyperCLOVAXForCausalLM": _HfExamplesInfo(
"naver-hyperclovax/HyperCLOVAX-SEED-Think-32B",
trust_remote_code=True,
),
"InternLMForCausalLM": _HfExamplesInfo(
"internlm/internlm-chat-7b", trust_remote_code=True
),
Expand Down Expand Up @@ -793,6 +797,10 @@ def check_available_online(
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
trust_remote_code=True,
),
"HCXVisionV2ForCausalLM": _HfExamplesInfo(
"naver-hyperclovax/HyperCLOVAX-SEED-Think-32B",
trust_remote_code=True,
),
"HunYuanVLForConditionalGeneration": _HfExamplesInfo(
"tencent/HunyuanOCR",
hf_overrides={"num_experts": 0},
Expand Down
10 changes: 5 additions & 5 deletions vllm/entrypoints/chat_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1428,6 +1428,8 @@ def _parse_chat_message_content_part(
with multimodal placeholders.
"""
if isinstance(part, str): # Handle plain text parts
if wrap_dicts:
return {"type": "text", "text": part}
return part
# Handle structured dictionary parts
part_type, content = _parse_chat_message_content_mm_part(part)
Expand Down Expand Up @@ -1487,11 +1489,9 @@ def _parse_chat_message_content_part(
else:
raise NotImplementedError(f"Unknown part type: {part_type}")

return (
{"type": modality}
if wrap_dicts
else (MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None)
)
if wrap_dicts:
return {"type": modality}
return MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None


# No need to validate using Pydantic again
Expand Down
25 changes: 21 additions & 4 deletions vllm/model_executor/models/hyperclovax_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,14 +325,16 @@ def _get_mm_fields_config(
hf_inputs: BatchFeature,
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
return dict(
fields = dict(
pixel_values_images=MultiModalFieldConfig.batched("image"),
image_sizes_images=MultiModalFieldConfig.batched("image"),
vision_query_lengths_images=MultiModalFieldConfig.batched("image"),
pixel_values_videos=MultiModalFieldConfig.batched("video"),
vision_query_lengths_videos=MultiModalFieldConfig.batched("video"),
)

return fields


def _build_hcxvision_hf_info(
ctx: InputProcessingContext,
Expand Down Expand Up @@ -590,12 +592,26 @@ def build_mlp(
dummy_inputs=HCXVisionDummyInputsBuilder,
)
class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
"""
HyperCLOVAX-SEED Vision-Language Model (V1 architecture).

Supports:
- HyperCLOVAX-SEED-Vision-Instruct-3B

Uses CLIP/SigLIP as the vision encoder with C-Abstractor projector.
"""

packed_modules_mapping = {
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
"gate_up_proj": ["gate_proj", "up_proj"],
}

def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
def __init__(
self,
*,
vllm_config: VllmConfig,
prefix: str = "",
) -> None:
super().__init__()

# init configs
Expand Down Expand Up @@ -647,8 +663,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
self.vision_config = vision_config
self.text_config = text_config

# use_sum_loss = bool(kwargs.pop("use_sum_loss", False))
# self.reduction = self._init_reduction_type(use_sum_loss)
self.make_empty_intermediate_tensors = (
self.language_model.make_empty_intermediate_tensors
)

@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> str | None:
Expand Down
Loading