diff --git a/tests/models/multimodal/generation/test_keye.py b/tests/models/multimodal/generation/test_keye.py index 4205a8b2d1ac..d7430821d7ae 100644 --- a/tests/models/multimodal/generation/test_keye.py +++ b/tests/models/multimodal/generation/test_keye.py @@ -24,12 +24,8 @@ class ModelRequestData(NamedTuple): sampling_params: SamplingParams | None = None -@pytest.mark.core_model @pytest.mark.parametrize("question", [QUESTION]) -def test_keye_vl( - image_assets, - question: str, -): +def test_keye_vl(image_assets, question: str): images = [asset.pil_image for asset in image_assets] image_urls = [encode_image_url(image) for image in images] diff --git a/tests/models/multimodal/generation/test_nemotron_parse.py b/tests/models/multimodal/generation/test_nemotron_parse.py index 1b05d336c10b..e224f31e6df9 100644 --- a/tests/models/multimodal/generation/test_nemotron_parse.py +++ b/tests/models/multimodal/generation/test_nemotron_parse.py @@ -1,21 +1,53 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from collections.abc import Sequence +from collections.abc import Iterable, Sequence import pytest +import regex as re from transformers import AutoModel from tests.models.utils import check_logprobs_close from vllm.assets.image import ImageAsset +from vllm.logprobs import Logprob, SampleLogprobs +from vllm.tokenizers import TokenizerLike from ....conftest import HfRunner, PromptImageInput, VllmRunner -from ....utils import create_new_process_for_each_test IMAGE = ImageAsset("paper-11").pil_image_ext(ext="png").convert("RGB") PROMPT = "" +class DummyLogprobs(dict[int, Logprob]): + def __init__(self, vocab_ids: Iterable[int]): + super().__init__(dict.fromkeys(vocab_ids, Logprob(0.0))) + + def __repr__(self): + return "DummyLogprobs()" + + +def mask_bbox_tokens( + output: tuple[list[int], str, SampleLogprobs], + tokenizer: TokenizerLike, +) -> tuple[list[int], str, SampleLogprobs]: + """ + Always pass check_logprobs_close check for bounding box tokens + because it is reasonable for them to differ slightly. + """ + ignore_pattern = r"<[xy]_[\d.]+>" + vocab = tokenizer.get_vocab() + + output_ids, output_str, out_logprobs = output + + masked_logprobs = list[dict[int, Logprob]]() + for token, logprobs in zip(output_ids, out_logprobs): + if re.match(ignore_pattern, tokenizer.decode(token)): + masked_logprobs.append(DummyLogprobs(vocab.values())) + else: + masked_logprobs.append(logprobs) + + return output_ids, output_str, masked_logprobs + + def run_test( hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], @@ -44,6 +76,8 @@ def run_test( for prompts, images in inputs ] + tokenizer = vllm_model.llm.get_tokenizer() + with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model: hf_outputs_per_case = [ hf_model.generate_greedy_logprobs_limit( @@ -58,18 +92,20 @@ def run_test( for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case): check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, + outputs_0_lst=[ + mask_bbox_tokens(output, tokenizer) for output in hf_outputs + ], + outputs_1_lst=[ + mask_bbox_tokens(output, tokenizer) for output in vllm_outputs + ], name_0="hf", name_1="vllm", ) -@pytest.mark.core_model @pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"]) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("num_logprobs", [5]) -@create_new_process_for_each_test("spawn") def test_models( hf_runner, vllm_runner, model: str, dtype: str, num_logprobs: int ) -> None: @@ -77,10 +113,7 @@ def test_models( hf_runner, vllm_runner, inputs=[ - ( - [PROMPT] * 10, - [IMAGE] * 10, - ), + ([PROMPT] * 10, [IMAGE] * 10), ], model=model, dtype=dtype, diff --git a/tests/models/test_terratorch.py b/tests/models/test_terratorch.py index 0de505b05e48..71125dbe94f8 100644 --- a/tests/models/test_terratorch.py +++ b/tests/models/test_terratorch.py @@ -8,7 +8,7 @@ from tests.utils import create_new_process_for_each_test -@create_new_process_for_each_test() # Memory is not cleaned up properly otherwise +@create_new_process_for_each_test() # Hangs otherwise @pytest.mark.parametrize( "model", [ diff --git a/vllm/model_executor/models/nemotron_parse.py b/vllm/model_executor/models/nemotron_parse.py index c99c8800d80b..f4837185f7d7 100644 --- a/vllm/model_executor/models/nemotron_parse.py +++ b/vllm/model_executor/models/nemotron_parse.py @@ -319,8 +319,9 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), (".self_attn.qkv_proj", ".self_attn.k_proj", "k"), (".self_attn.qkv_proj", ".self_attn.v_proj", "v"), - (".encoder_attn.kv_proj", ".encoder_attn.k_proj", "k"), - (".encoder_attn.kv_proj", ".encoder_attn.v_proj", "v"), + # MergedColumnParallelLinear uses integer indices (0, 1) + (".encoder_attn.kv_proj", ".encoder_attn.k_proj", 0), + (".encoder_attn.kv_proj", ".encoder_attn.v_proj", 1), ] params_dict = dict(self.named_parameters()) loaded_params: set[str] = set()