Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
6859629
fix(hyperclovax): support multimodal prompt handling
effortprogrammer Jan 10, 2026
c603df8
fix(hyperclovax): register HyperCLOVAXForCausalLM
effortprogrammer Jan 10, 2026
ac10e8a
fix(hyperclovax): restore V1 HCXVisionForCausalLM class for backward compatibility
effortprogrammer Jan 17, 2026
6fedc9e
feat(hyperclovax): add dedicated V2 vision module
effortprogrammer Feb 19, 2026
88d6efd
refactor(hyperclovax): isolate V1 implementation and reroute V2 registry
effortprogrammer Feb 19, 2026
4a550fd
merge(main): resolve HyperCLOVAX vision split conflicts
effortprogrammer Feb 19, 2026
7fce965
fix(hyperclovax): use multimodal.processing imports for V2 dummy inputs
effortprogrammer Feb 19, 2026
d4ab8eb
fix(hyperclovax): align V2 dummy input builder with multimodal API
effortprogrammer Feb 19, 2026
340d0b5
fix(hyperclovax): drop unsupported multimodal_config arg in V2 vision…
effortprogrammer Feb 19, 2026
729187a
refactor(hyperclovax): address reviewer feedback in model and registry
effortprogrammer Feb 19, 2026
335bc41
docs: add HyperCLOVAX V2 to supported models
effortprogrammer Feb 19, 2026
8b0d8b3
fix(hyperclovax): remove Omni references from V2 scope
effortprogrammer Feb 19, 2026
677e249
fix(hyperclovax): limit V2 text backend handling to hyperclovax
effortprogrammer Feb 19, 2026
8aeb7a6
fix(hyperclovax): restore original text backend condition
effortprogrammer Feb 19, 2026
f1be0ec
Merge branch 'main' into feat/hyperclovax-seed-think-32b
DarkLight1337 Feb 20, 2026
a175d40
Merge branch 'main' into feat/hyperclovax-seed-think-32b
effortprogrammer Mar 8, 2026
f419cda
fix(hyperclovax): align V2 multimodal processor update flow
effortprogrammer Mar 8, 2026
0c25a00
fix(chat): normalize openai media parts to canonical modality types
effortprogrammer Mar 9, 2026
b1036cc
fix(hyperclovax): satisfy registry coverage and dummy prompt type
effortprogrammer Mar 9, 2026
56ac889
fix(registry): map HyperCLOVAX architecture as text model
effortprogrammer Mar 9, 2026
5abbb8c
fix(ci): harden cpu runner tensor replacement and realtime warmup wait
effortprogrammer Mar 9, 2026
dd7227f
Merge origin/main into feat/hyperclovax-seed-think-32b
effortprogrammer Mar 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/models/supported_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -701,6 +701,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `GlmOcrForConditionalGeneration` | GLM-OCR | T + I<sup>E+</sup> | `zai-org/GLM-OCR`, etc. | ✅︎ | ✅︎ |
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ |
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | T + I<sup>+</sup> + V<sup>+</sup> | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | |
| `HCXVisionV2ForCausalLM` | HyperCLOVAX-SEED-Think-32B | T + I<sup>+</sup> + V<sup>+</sup> | `naver-hyperclovax/HyperCLOVAX-SEED-Think-32B` | | |
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
| `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
Expand Down
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_realtime_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ async def test_multi_chunk_streaming(
# JIT compilation
warmup_done = False
while not warmup_done:
event = await receive_event(ws, timeout=360.0)
event = await receive_event(ws, timeout=600.0)
if event["type"] in ("transcription.done", "error"):
warmup_done = True

Expand Down
32 changes: 32 additions & 0 deletions tests/entrypoints/test_chat_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1458,6 +1458,38 @@ def test_parse_chat_messages_context_text_format(
assert mm_uuids is None


def test_parse_chat_messages_openai_format_image_url(
    phi3v_model_config,
    image_url,
):
    """Parts parsed with ``content_format="openai"`` keep the list-of-dicts
    shape: the image_url part collapses to a bare ``{"type": "image"}``
    placeholder while the text part is passed through verbatim."""
    # One user turn mixing an OpenAI-style image_url part with plain text.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": image_url}},
                {"type": "text", "text": "What's in the image?"},
            ],
        }
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        messages,
        phi3v_model_config,
        content_format="openai",
    )

    expected_conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What's in the image?"},
            ],
        }
    ]
    assert conversation == expected_conversation
    # Exactly one image was extracted, with no caller-supplied UUID.
    _assert_mm_data_is_image_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])


def test_parse_chat_messages_rejects_too_many_images_in_one_message(
phi3v_model_config,
image_url,
Expand Down
8 changes: 8 additions & 0 deletions tests/models/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,10 @@ def check_available_online(
"HunYuanMoEV1ForCausalLM": _HfExamplesInfo(
"tencent/Hunyuan-A13B-Instruct", trust_remote_code=True
),
"HyperCLOVAXForCausalLM": _HfExamplesInfo(
"naver-hyperclovax/HyperCLOVAX-SEED-Think-32B",
trust_remote_code=True,
),
"InternLMForCausalLM": _HfExamplesInfo(
"internlm/internlm-chat-7b", trust_remote_code=True
),
Expand Down Expand Up @@ -793,6 +797,10 @@ def check_available_online(
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
trust_remote_code=True,
),
"HCXVisionV2ForCausalLM": _HfExamplesInfo(
"naver-hyperclovax/HyperCLOVAX-SEED-Think-32B",
trust_remote_code=True,
),
"HunYuanVLForConditionalGeneration": _HfExamplesInfo(
"tencent/HunyuanOCR",
hf_overrides={"num_experts": 0},
Expand Down
10 changes: 5 additions & 5 deletions vllm/entrypoints/chat_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1428,6 +1428,8 @@ def _parse_chat_message_content_part(
with multimodal placeholders.
"""
if isinstance(part, str): # Handle plain text parts
if wrap_dicts:
return {"type": "text", "text": part}
return part
# Handle structured dictionary parts
part_type, content = _parse_chat_message_content_mm_part(part)
Expand Down Expand Up @@ -1487,11 +1489,9 @@ def _parse_chat_message_content_part(
else:
raise NotImplementedError(f"Unknown part type: {part_type}")

return (
{"type": modality}
if wrap_dicts
else (MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None)
)
if wrap_dicts:
return {"type": modality}
return MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None


# No need to validate using Pydantic again
Expand Down
25 changes: 21 additions & 4 deletions vllm/model_executor/models/hyperclovax_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,14 +325,16 @@ def _get_mm_fields_config(
hf_inputs: BatchFeature,
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
return dict(
fields = dict(
pixel_values_images=MultiModalFieldConfig.batched("image"),
image_sizes_images=MultiModalFieldConfig.batched("image"),
vision_query_lengths_images=MultiModalFieldConfig.batched("image"),
pixel_values_videos=MultiModalFieldConfig.batched("video"),
vision_query_lengths_videos=MultiModalFieldConfig.batched("video"),
)

return fields


def _build_hcxvision_hf_info(
ctx: InputProcessingContext,
Expand Down Expand Up @@ -590,12 +592,26 @@ def build_mlp(
dummy_inputs=HCXVisionDummyInputsBuilder,
)
class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
"""
HyperCLOVAX-SEED Vision-Language Model (V1 architecture).

Supports:
- HyperCLOVAX-SEED-Vision-Instruct-3B

Uses CLIP/SigLIP as the vision encoder with C-Abstractor projector.
"""

packed_modules_mapping = {
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
"gate_up_proj": ["gate_proj", "up_proj"],
}

def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
def __init__(
self,
*,
vllm_config: VllmConfig,
prefix: str = "",
) -> None:
super().__init__()

# init configs
Expand Down Expand Up @@ -647,8 +663,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
self.vision_config = vision_config
self.text_config = text_config

# use_sum_loss = bool(kwargs.pop("use_sum_loss", False))
# self.reduction = self._init_reduction_type(use_sum_loss)
self.make_empty_intermediate_tensors = (
self.language_model.make_empty_intermediate_tensors
)

@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> str | None:
Expand Down
Loading