Skip to content

Commit a18bfa0

Browse files
bigshanedoggepwalsh
authored and committed
[MODEL] New model support for naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B (vllm-project#20931)
Signed-off-by: bigshanedogg <[email protected]>
1 parent 1784929 commit a18bfa0

File tree

7 files changed

+1365
-0
lines changed

7 files changed

+1365
-0
lines changed

docs/models/supported_models.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,7 @@ th {
365365
| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
366366
| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ |
367367
| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ |
368+
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | ✅︎ |
368369
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
369370
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
370371
| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |

examples/offline_inference/vision_language.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,85 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
316316
)
317317

318318

319+
# naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B
def run_hyperclovax_seed_vision(
    questions: list[str], modality: str
) -> ModelRequestData:
    """Build engine args and chat-templated prompts for HyperCLOVAX-SEED-Vision.

    One single-turn conversation is built per question; the prompts are
    rendered as text via the model's chat template so vLLM receives plain
    string prompts alongside the multimodal data.

    Args:
        questions: Natural-language questions, one per generated prompt.
        modality: Either ``"image"`` or ``"video"``.

    Returns:
        ModelRequestData with engine args, rendered prompts, and no extra
        stop tokens.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Fail fast on an unsupported modality instead of only raising inside
    # the per-question loop (which would silently pass for empty input).
    if modality not in ("image", "video"):
        raise ValueError(f"Unsupported modality: {modality}")

    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        # Video inputs need a longer context window than single images.
        max_model_len=8192 if modality == "image" else 16384,
        limit_mm_per_prompt={modality: 1},
    )

    messages = []
    for question in questions:
        if modality == "image":
            # Optional auxiliary text fields on the image part:
            #   ocr: words in the image in raster order. Even if the word
            #     order feels unnatural for reading, the model will handle
            #     it as long as it follows raster order.
            #     e.g. "Naver, CLOVA, bigshane"
            #   lens_keywords: entity names in the image. e.g. "iPhone"
            #   lens_local_keywords: entity names with quads in the image.
            #     e.g. "[0.07, 0.21, 0.92, 0.90] iPhone"
            content = [
                {
                    "type": "image",
                    "ocr": "",
                    "lens_keywords": "",
                    "lens_local_keywords": "",
                },
                {
                    "type": "text",
                    "text": question,
                },
            ]
        else:  # modality == "video", guaranteed by the guard above
            content = [
                {
                    "type": "video",
                },
                {
                    "type": "text",
                    "text": question,
                },
            ]
        # Each prompt is an independent single-turn conversation.
        messages.append([{"role": "user", "content": content}])

    prompts = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=None,
    )
396+
397+
319398
# Idefics3-8B-Llama3
320399
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
321400
assert modality == "image"
@@ -1222,6 +1301,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
12221301
"glm4v": run_glm4v,
12231302
"glm4_1v": run_glm4_1v,
12241303
"h2ovl_chat": run_h2ovl,
1304+
"hyperclovax_seed_vision": run_hyperclovax_seed_vision,
12251305
"idefics3": run_idefics3,
12261306
"internvl_chat": run_internvl,
12271307
"nemotron_vl": run_nemotron_vl,

examples/offline_inference/vision_language_multi_image.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,53 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
289289
)
290290

291291

292+
def load_hyperclovax_seed_vision(
    question: str, image_urls: list[str]
) -> ModelRequestData:
    """Build a multi-image request for HyperCLOVAX-SEED-Vision-Instruct-3B.

    A single user turn is constructed with one image part per URL followed
    by the text question, rendered to a string prompt via the model's chat
    template; the images themselves are fetched eagerly into ``image_data``.

    Args:
        question: The text question to ask about the images.
        image_urls: URLs of the images to include, in order.

    Returns:
        ModelRequestData with engine args, the rendered prompt, and the
        fetched images.
    """
    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=16384,
        # Allow exactly as many images per prompt as were requested.
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # One image part per URL; the auxiliary ocr/lens_* text fields are
    # left empty here (see the single-image example for their semantics).
    content = [
        {
            "type": "image",
            "image": image_url,
            "ocr": "",
            "lens_keywords": "",
            "lens_local_keywords": "",
        }
        for image_url in image_urls
    ]
    content.append(
        {
            "type": "text",
            "text": question,
        }
    )

    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": content}],
        tokenize=False,
        add_generation_prompt=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=None,
        image_data=[fetch_image(url) for url in image_urls],
    )
337+
338+
292339
def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
293340
# NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs,
294341
# it will generate poor response for multi-image inputs!
@@ -900,6 +947,7 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
900947
"h2ovl_chat": load_h2ovl,
901948
"idefics3": load_idefics3,
902949
"internvl_chat": load_internvl,
950+
"hyperclovax_seed_vision": load_hyperclovax_seed_vision,
903951
"keye_vl": load_keye_vl,
904952
"kimi_vl": load_kimi_vl,
905953
"llava": load_llava,

tests/models/multimodal/processing/test_common.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@ def _test_processing_correctness_one(
278278
"HuggingFaceTB/SmolVLM2-2.2B-Instruct",
279279
"moonshotai/Kimi-VL-A3B-Instruct",
280280
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
281+
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
281282
"llava-hf/llava-1.5-7b-hf",
282283
"llava-hf/llava-v1.6-mistral-7b-hf",
283284
"llava-hf/LLaVA-NeXT-Video-7B-hf",

tests/models/registry.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,9 @@ def check_available_online(
201201
trust_remote_code=True),
202202
"HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124",
203203
trust_remote_code=True),
204+
"HCXVisionForCausalLM": _HfExamplesInfo(
205+
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
206+
trust_remote_code=True),
204207
"InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
205208
trust_remote_code=True),
206209
"InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",

0 commit comments

Comments
 (0)