From c034791fb1c1d627a83d9f4dd96db26a0deaeec9 Mon Sep 17 00:00:00 2001 From: Xingyu Liu Date: Tue, 16 Dec 2025 11:47:05 -0800 Subject: [PATCH 01/11] update architectures Signed-off-by: Xingyu Liu --- vllm/config/model.py | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index f98dc48fe997..de4921f19798 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -489,14 +489,14 @@ def __post_init__( ) self.model_arch_config = self.get_model_arch_config() - architectures = self.architectures + architecture = self.model_arch_config.architectures[0] registry = self.registry - is_generative_model = registry.is_text_generation_model(architectures, self) - is_pooling_model = registry.is_pooling_model(architectures, self) + is_generative_model = registry.is_text_generation_model([architecture], self) + is_pooling_model = registry.is_pooling_model(architecture, self) - self.runner_type = self._get_runner_type(architectures, self.runner) + self.runner_type = self._get_runner_type(architecture, self.runner) self.convert_type = self._get_convert_type( - architectures, self.runner_type, self.convert + architecture, self.runner_type, self.convert ) if self.runner_type == "generate" and not is_generative_model: @@ -516,7 +516,7 @@ def __post_init__( # Note: Initialize these attributes early because transformers fallback # may fail to load dynamic modules in child processes - model_info, arch = registry.inspect_model_cls(architectures, self) + model_info, arch = registry.inspect_model_cls(architecture, self) self._model_info = model_info self._architecture = arch logger.info("Resolved architecture: %s", arch) @@ -748,7 +748,7 @@ def _get_encoder_config(self): def _get_default_runner_type( self, - architectures: list[str], + architecture: str, ) -> RunnerType: registry = self.registry @@ -756,29 +756,28 @@ def _get_default_runner_type( if get_pooling_config(self.model, self.revision): return "pooling" - for arch in architectures: - if arch in registry.get_supported_archs(): - if registry.is_pooling_model(architectures, self): - return "pooling" - if registry.is_text_generation_model(architectures, self): - return "generate" + if architecture in registry.get_supported_archs(): + if registry.is_pooling_model(architecture, self): + return "pooling" + if registry.is_text_generation_model(architecture, self): + return "generate" - match = try_match_architecture_defaults(arch) - if match: - _, (runner_type, _) = match - return runner_type + match = try_match_architecture_defaults(architecture) + if match: + _, (runner_type, _) = match + return runner_type return "generate" def _get_runner_type( self, - architectures: list[str], + architecture: str, runner: RunnerOption, ) -> RunnerType: if runner != "auto": return runner - runner_type = self._get_default_runner_type(architectures) + runner_type = self._get_default_runner_type(architecture) # Don't log the most common case if runner_type != "generate": From 0d71d65aa38040073d945be5b2685d3f44aefdaa Mon Sep 17 00:00:00 2001 From: Xingyu Liu Date: Fri, 2 Jan 2026 15:52:43 -0800 Subject: [PATCH 02/11] update all architectures to architecture Signed-off-by: Xingyu Liu --- tests/config/base_model_arch_groundtruth.json | 128 ++++++------------ .../config/draft_model_arch_groundtruth.json | 32 ++--- tests/config/test_model_arch_config.py | 4 +- .../language/generation_ppl_test/ppl_utils.py | 4 +- .../pooling_mteb_test/mteb_embed_utils.py | 4 +- 
.../pooling_mteb_test/mteb_score_utils.py | 4 +- vllm/config/model.py | 47 +++---- vllm/config/model_arch.py | 4 +- vllm/lora/layers/fused_moe.py | 2 +- vllm/model_executor/models/registry.py | 64 ++++----- .../model_executor/models/transformers/moe.py | 2 +- .../model_arch_config_convertor.py | 11 +- 12 files changed, 123 insertions(+), 183 deletions(-) diff --git a/tests/config/base_model_arch_groundtruth.json b/tests/config/base_model_arch_groundtruth.json index 3401198ad7d5..5e49b077828b 100644 --- a/tests/config/base_model_arch_groundtruth.json +++ b/tests/config/base_model_arch_groundtruth.json @@ -1,8 +1,5 @@ { "state-spaces/mamba-130m-hf": { - "architectures": [ - "MambaForCausalLM" - ], "model_type": "mamba", "text_model_type": "mamba", "hidden_size": 768, @@ -14,12 +11,10 @@ "num_experts": 0, "is_deepseek_mla": false, "is_multimodal_model": false, - "dtype": "torch.float32" + "dtype": "torch.float32", + "architecture": "MambaForCausalLM" }, "mistralai/Mamba-Codestral-7B-v0.1": { - "architectures": [ - "Mamba2ForCausalLM" - ], "model_type": "mamba", "text_model_type": "mamba", "hidden_size": 4096, @@ -31,12 +26,10 @@ "num_experts": 0, "is_deepseek_mla": false, "is_multimodal_model": false, - "dtype": "torch.bfloat16" + "dtype": "torch.bfloat16", + "architecture": "Mamba2ForCausalLM" }, "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11": { - "architectures": [ - "Terratorch" - ], "model_type": "timm_wrapper", "text_model_type": "timm_wrapper", "hidden_size": 0, @@ -48,12 +41,10 @@ "num_experts": 0, "is_deepseek_mla": false, "is_multimodal_model": true, - "dtype": "torch.float32" + "dtype": "torch.float32", + "architecture": "Terratorch" }, "tiiuae/falcon-mamba-7b-instruct": { - "architectures": [ - "FalconMambaForCausalLM" - ], "model_type": "falcon_mamba", "text_model_type": "falcon_mamba", "hidden_size": 4096, @@ -65,12 +56,10 @@ "num_experts": 0, "is_deepseek_mla": false, "is_multimodal_model": false, - "dtype": "torch.bfloat16" + "dtype": "torch.bfloat16", + "architecture": "FalconMambaForCausalLM" }, "Zyphra/Zamba2-7B-instruct": { - "architectures": [ - "Zamba2ForCausalLM" - ], "model_type": "zamba2", "text_model_type": "zamba2", "hidden_size": 3584, @@ -82,12 +71,10 @@ "num_experts": 0, "is_deepseek_mla": false, "is_multimodal_model": false, - "dtype": "torch.bfloat16" + "dtype": "torch.bfloat16", + "architecture": "Zamba2ForCausalLM" }, "mosaicml/mpt-7b": { - "architectures": [ - "MPTForCausalLM" - ], "model_type": "mpt", "text_model_type": "mpt", "hidden_size": 4096, @@ -99,12 +86,10 @@ "num_experts": 0, "is_deepseek_mla": false, "is_multimodal_model": false, - "dtype": "torch.bfloat16" + "dtype": "torch.bfloat16", + "architecture": "MPTForCausalLM" }, "databricks/dbrx-instruct": { - "architectures": [ - "DbrxForCausalLM" - ], "model_type": "dbrx", "text_model_type": "dbrx", "hidden_size": 6144, @@ -116,12 +101,10 @@ "num_experts": 0, "is_deepseek_mla": false, "is_multimodal_model": false, - "dtype": "torch.bfloat16" + "dtype": "torch.bfloat16", + "architecture": "DbrxForCausalLM" }, "tiiuae/falcon-7b": { - "architectures": [ - "FalconForCausalLM" - ], "model_type": "falcon", "text_model_type": "falcon", "hidden_size": 4544, @@ -133,12 +116,10 @@ "num_experts": 0, "is_deepseek_mla": false, "is_multimodal_model": false, - "dtype": "torch.bfloat16" + "dtype": "torch.bfloat16", + "architecture": "FalconForCausalLM" }, "tiiuae/falcon-40b": { - "architectures": [ - "FalconForCausalLM" - ], "model_type": "falcon", "text_model_type": "falcon", "hidden_size": 8192, @@ 
-150,12 +131,10 @@ "num_experts": 0, "is_deepseek_mla": false, "is_multimodal_model": false, - "dtype": "torch.bfloat16" + "dtype": "torch.bfloat16", + "architecture": "FalconForCausalLM" }, "luccafong/deepseek_mtp_main_random": { - "architectures": [ - "DeepseekV3ForCausalLM" - ], "model_type": "deepseek_v3", "text_model_type": "deepseek_v3", "hidden_size": 2560, @@ -167,12 +146,10 @@ "num_experts": 72, "is_deepseek_mla": true, "is_multimodal_model": false, - "dtype": "torch.bfloat16" + "dtype": "torch.bfloat16", + "architecture": "DeepseekV3ForCausalLM" }, "luccafong/deepseek_mtp_draft_random": { - "architectures": [ - "DeepseekV3ForCausalLM" - ], "model_type": "deepseek_v3", "text_model_type": "deepseek_v3", "hidden_size": 2560, @@ -184,12 +161,10 @@ "num_experts": 72, "is_deepseek_mla": true, "is_multimodal_model": false, - "dtype": "torch.bfloat16" + "dtype": "torch.bfloat16", + "architecture": "DeepseekV3ForCausalLM" }, "Qwen/Qwen3-Next-80B-A3B-Instruct": { - "architectures": [ - "Qwen3NextForCausalLM" - ], "model_type": "qwen3_next", "text_model_type": "qwen3_next", "hidden_size": 2048, @@ -201,12 +176,10 @@ "num_experts": 512, "is_deepseek_mla": false, "is_multimodal_model": false, - "dtype": "torch.bfloat16" + "dtype": "torch.bfloat16", + "architecture": "Qwen3NextForCausalLM" }, "tiny-random/qwen3-next-moe": { - "architectures": [ - "Qwen3NextForCausalLM" - ], "model_type": "qwen3_next", "text_model_type": "qwen3_next", "hidden_size": 8, @@ -218,12 +191,10 @@ "num_experts": 32, "is_deepseek_mla": false, "is_multimodal_model": false, - "dtype": "torch.bfloat16" + "dtype": "torch.bfloat16", + "architecture": "Qwen3NextForCausalLM" }, "zai-org/GLM-4.5": { - "architectures": [ - "Glm4MoeForCausalLM" - ], "model_type": "glm4_moe", "text_model_type": "glm4_moe", "hidden_size": 5120, @@ -235,12 +206,10 @@ "num_experts": 160, "is_deepseek_mla": false, "is_multimodal_model": false, - "dtype": "torch.bfloat16" + "dtype": "torch.bfloat16", + "architecture": "Glm4MoeForCausalLM" }, "baidu/ERNIE-4.5-21B-A3B-PT": { - "architectures": [ - "Ernie4_5_MoeForCausalLM" - ], "model_type": "ernie4_5_moe", "text_model_type": "ernie4_5_moe", "hidden_size": 2560, @@ -252,12 +221,10 @@ "num_experts": 64, "is_deepseek_mla": false, "is_multimodal_model": false, - "dtype": "torch.bfloat16" + "dtype": "torch.bfloat16", + "architecture": "Ernie4_5_MoeForCausalLM" }, "lmsys/gpt-oss-20b-bf16": { - "architectures": [ - "GptOssForCausalLM" - ], "model_type": "gpt_oss", "text_model_type": "gpt_oss", "hidden_size": 2880, @@ -269,12 +236,10 @@ "num_experts": 32, "is_deepseek_mla": false, "is_multimodal_model": false, - "dtype": "torch.bfloat16" + "dtype": "torch.bfloat16", + "architecture": "GptOssForCausalLM" }, "deepseek-ai/DeepSeek-V3.2-Exp": { - "architectures": [ - "DeepseekV32ForCausalLM" - ], "model_type": "deepseek_v32", "text_model_type": "deepseek_v32", "hidden_size": 7168, @@ -286,12 +251,10 @@ "num_experts": 256, "is_deepseek_mla": true, "is_multimodal_model": false, - "dtype": "torch.bfloat16" + "dtype": "torch.bfloat16", + "architecture": "DeepseekV32ForCausalLM" }, "meta-llama/Llama-4-Scout-17B-16E-Instruct": { - "architectures": [ - "Llama4ForConditionalGeneration" - ], "model_type": "llama4", "text_model_type": "llama4_text", "hidden_size": 5120, @@ -303,12 +266,10 @@ "num_experts": 16, "is_deepseek_mla": false, "is_multimodal_model": true, - "dtype": "torch.bfloat16" + "dtype": "torch.bfloat16", + "architecture": "Llama4ForConditionalGeneration" }, "nvidia/Llama-3_3-Nemotron-Super-49B-v1": { - 
"architectures": [ - "DeciLMForCausalLM" - ], "model_type": "nemotron-nas", "text_model_type": "nemotron-nas", "hidden_size": 8192, @@ -320,12 +281,10 @@ "num_experts": 0, "is_deepseek_mla": false, "is_multimodal_model": false, - "dtype": "torch.bfloat16" + "dtype": "torch.bfloat16", + "architecture": "DeciLMForCausalLM" }, "XiaomiMiMo/MiMo-7B-RL": { - "architectures": [ - "MiMoForCausalLM" - ], "model_type": "mimo", "text_model_type": "mimo", "hidden_size": 4096, @@ -337,12 +296,10 @@ "num_experts": 0, "is_deepseek_mla": false, "is_multimodal_model": false, - "dtype": "torch.bfloat16" + "dtype": "torch.bfloat16", + "architecture": "MiMoForCausalLM" }, "meituan-longcat/LongCat-Flash-Chat": { - "architectures": [ - "LongcatFlashForCausalLM" - ], "model_type": "longcat_flash", "text_model_type": "longcat_flash", "hidden_size": 6144, @@ -354,6 +311,7 @@ "num_experts": 512, "is_deepseek_mla": true, "is_multimodal_model": false, - "dtype": "torch.float32" + "dtype": "torch.float32", + "architecture": "LongcatFlashForCausalLM" } -} +} \ No newline at end of file diff --git a/tests/config/draft_model_arch_groundtruth.json b/tests/config/draft_model_arch_groundtruth.json index dfe6f3d39e93..46b2adf545db 100644 --- a/tests/config/draft_model_arch_groundtruth.json +++ b/tests/config/draft_model_arch_groundtruth.json @@ -1,8 +1,5 @@ { "abhigoyal/vllm-medusa-llama-68m-random": { - "architectures": [ - "MedusaModel" - ], "model_type": "medusa", "text_model_type": "medusa", "hidden_size": 768, @@ -14,12 +11,10 @@ "num_experts": 0, "is_deepseek_mla": false, "is_multimodal_model": false, - "dtype": "torch.float32" + "dtype": "torch.float32", + "architecture": "MedusaModel" }, "luccafong/deepseek_mtp_draft_random": { - "architectures": [ - "DeepSeekMTPModel" - ], "model_type": "deepseek_mtp", "text_model_type": "deepseek_mtp", "hidden_size": 2560, @@ -31,12 +26,10 @@ "num_experts": 72, "is_deepseek_mla": true, "is_multimodal_model": false, - "dtype": "torch.bfloat16" + "dtype": "torch.bfloat16", + "architecture": "DeepSeekMTPModel" }, "eagle618/eagle-deepseek-v3-random": { - "architectures": [ - "EagleDeepSeekMTPModel" - ], "model_type": "eagle", "text_model_type": "deepseek_mtp", "hidden_size": 2560, @@ -48,12 +41,10 @@ "num_experts": 72, "is_deepseek_mla": true, "is_multimodal_model": false, - "dtype": "bfloat16" + "dtype": "bfloat16", + "architecture": "EagleDeepSeekMTPModel" }, "yuhuili/EAGLE-LLaMA3-Instruct-8B": { - "architectures": [ - "EagleLlamaForCausalLM" - ], "model_type": "eagle", "text_model_type": "llama", "hidden_size": 4096, @@ -65,12 +56,10 @@ "num_experts": 0, "is_deepseek_mla": false, "is_multimodal_model": false, - "dtype": "float16" + "dtype": "float16", + "architecture": "EagleLlamaForCausalLM" }, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B": { - "architectures": [ - "Eagle3LlamaForCausalLM" - ], "model_type": "eagle", "text_model_type": "llama", "hidden_size": 4096, @@ -82,6 +71,7 @@ "num_experts": 0, "is_deepseek_mla": false, "is_multimodal_model": false, - "dtype": "float16" + "dtype": "float16", + "architecture": "Eagle3LlamaForCausalLM" } -} +} \ No newline at end of file diff --git a/tests/config/test_model_arch_config.py b/tests/config/test_model_arch_config.py index 06d4c6e7a865..8fa9d137e837 100644 --- a/tests/config/test_model_arch_config.py +++ b/tests/config/test_model_arch_config.py @@ -66,7 +66,7 @@ def _assert_model_arch_config( ): """Assert model_arch_config matches expected values.""" model_arch_config = model_config.model_arch_config - assert model_arch_config.architectures 
== expected["architectures"] + assert model_arch_config.architecture == expected["architecture"] assert model_arch_config.model_type == expected["model_type"] assert model_arch_config.text_model_type == expected["text_model_type"] assert model_arch_config.hidden_size == expected["hidden_size"] @@ -95,7 +95,7 @@ def _assert_model_config_methods( model_config, expected: dict, check_head_size: bool = True ): """Assert model_config methods return expected values.""" - assert model_config.architectures == expected["architectures"] + assert model_config.architecture == expected["architecture"] assert model_config.get_vocab_size() == expected["vocab_size"] assert model_config.get_hidden_size() == expected["hidden_size"] assert model_config.get_total_num_kv_heads() == expected["total_num_kv_heads"] diff --git a/tests/models/language/generation_ppl_test/ppl_utils.py b/tests/models/language/generation_ppl_test/ppl_utils.py index 59740505e827..86f06fc2f540 100644 --- a/tests/models/language/generation_ppl_test/ppl_utils.py +++ b/tests/models/language/generation_ppl_test/ppl_utils.py @@ -46,7 +46,9 @@ def wikitext_ppl_test( # Confirm whether vllm is using the correct architecture if model_info.architecture: - assert model_info.architecture in model_config.architectures + assert ( + model_info.architecture == model_config.model_arch_config.architecture + ) max_length = min(model_config.max_model_len - 1, max_length) stride = max_length diff --git a/tests/models/language/pooling_mteb_test/mteb_embed_utils.py b/tests/models/language/pooling_mteb_test/mteb_embed_utils.py index a0b469f93064..a1a269c232ba 100644 --- a/tests/models/language/pooling_mteb_test/mteb_embed_utils.py +++ b/tests/models/language/pooling_mteb_test/mteb_embed_utils.py @@ -159,7 +159,9 @@ def mteb_test_embed_models( # Confirm whether vllm is using the correct architecture if model_info.architecture: - assert model_info.architecture in model_config.architectures + assert ( + model_info.architecture == model_config.model_arch_config.architecture + ) # Confirm whether the important configs in model_config are correct. 
if model_info.pooling_type is not None: diff --git a/tests/models/language/pooling_mteb_test/mteb_score_utils.py b/tests/models/language/pooling_mteb_test/mteb_score_utils.py index 6c1350231773..153e82fe67d5 100644 --- a/tests/models/language/pooling_mteb_test/mteb_score_utils.py +++ b/tests/models/language/pooling_mteb_test/mteb_score_utils.py @@ -204,7 +204,9 @@ def mteb_test_rerank_models( # Confirm whether vllm is using the correct architecture if model_info.architecture: - assert model_info.architecture in model_config.architectures + assert ( + model_info.architecture == model_config.model_arch_config.architecture + ) # Score API is only enabled for num_labels == 1 assert model_config.hf_config.num_labels == 1 diff --git a/vllm/config/model.py b/vllm/config/model.py index de4921f19798..b63427e2501e 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -489,9 +489,9 @@ def __post_init__( ) self.model_arch_config = self.get_model_arch_config() - architecture = self.model_arch_config.architectures[0] + architecture = self.model_arch_config.architecture registry = self.registry - is_generative_model = registry.is_text_generation_model([architecture], self) + is_generative_model = registry.is_text_generation_model(architecture, self) is_pooling_model = registry.is_pooling_model(architecture, self) self.runner_type = self._get_runner_type(architecture, self.runner) @@ -660,7 +660,7 @@ def _get_transformers_backend_cls(self) -> str: # Check if the architecture we're wrapping has defaults runner = None task = None - if defaults := try_match_architecture_defaults(self.architectures[0]): + if defaults := try_match_architecture_defaults(self.architecture): _, (runner, task) = defaults # User specified value take precedence if self.runner != "auto": @@ -687,10 +687,6 @@ def using_transformers_backend(self) -> bool: def registry(self): return me_models.ModelRegistry - @property - def architectures(self) -> list[str]: - return self.model_arch_config.architectures - @property def architecture(self) -> str: """The architecture vllm actually used.""" @@ -791,26 +787,25 @@ def _get_runner_type( def _get_default_convert_type( self, - architectures: list[str], + architecture: str, runner_type: RunnerType, ) -> ConvertType: registry = self.registry - for arch in architectures: - if arch in registry.get_supported_archs(): - if runner_type == "generate" and registry.is_text_generation_model( - architectures, self - ): - return "none" - if runner_type == "pooling" and registry.is_pooling_model( - architectures, self - ): - return "none" + if architecture in registry.get_supported_archs(): + if runner_type == "generate" and registry.is_text_generation_model( + architecture, self + ): + return "none" + if runner_type == "pooling" and registry.is_pooling_model( + architecture, self + ): + return "none" - match = try_match_architecture_defaults(arch, runner_type=runner_type) - if match: - _, (_, convert_type) = match - return convert_type + match = try_match_architecture_defaults(architecture, runner_type=runner_type) + if match: + _, (_, convert_type) = match + return convert_type # This is to handle Sentence Transformers models that use *ForCausalLM # and also multi-modal pooling models which are not defined as @@ -822,7 +817,7 @@ def _get_default_convert_type( def _get_convert_type( self, - architectures: list[str], + architecture: str, runner_type: RunnerType, convert: ConvertOption, ) -> ConvertType: @@ -836,7 +831,7 @@ def _get_convert_type( if convert != "auto": return convert - convert_type = 
self._get_default_convert_type(architectures, runner_type) + convert_type = self._get_default_convert_type(architecture, runner_type) # Don't log the most common case if convert_type != "none": @@ -1036,7 +1031,7 @@ def verify_with_parallel_config( pipeline_parallel_size = parallel_config.pipeline_parallel_size if pipeline_parallel_size > 1 and not self.registry.is_pp_supported_model( - self.architectures, self + self.architecture, self ): raise NotImplementedError( "Pipeline parallelism is not supported for this model. " @@ -1346,7 +1341,7 @@ def uses_alibi(self) -> bool: return ( getattr(cfg, "alibi", False) # Falcon - or "BloomForCausalLM" in self.architectures # Bloom + or self.architecture == "BloomForCausalLM" # Bloom or getattr(cfg, "position_encoding_type", "") == "alibi" # codellm_1b_alibi or ( hasattr(cfg, "attn_config") # MPT diff --git a/vllm/config/model_arch.py b/vllm/config/model_arch.py index d55e2a3399b3..3df37d37241b 100644 --- a/vllm/config/model_arch.py +++ b/vllm/config/model_arch.py @@ -16,8 +16,8 @@ class ModelArchitectureConfig: Configuration for model architecture that required by vLLM runtime """ - architectures: list[str] | None - """List of model architecture class names (e.g., ['LlamaForCausalLM']). + architecture: str | None + """Model architecture class name (e.g., 'LlamaForCausalLM'). It can be None upon calling `vllm_config.with_hf_config(config.text_config)`""" model_type: str diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 7c6d86b3602f..8001213b1d43 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -617,7 +617,7 @@ def create_lora_weights( """Initializes lora matrices.""" assert isinstance(model_config, PretrainedConfig) - self._base_model = model_config.architectures[0] + self._base_model = model_config.architecture self.max_loras = lora_config.max_loras self.fully_sharded = lora_config.fully_sharded_loras diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index e0e346fcd878..25459c18fa35 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -939,17 +939,15 @@ def _normalize_arch( def inspect_model_cls( self, - architectures: str | list[str], + architecture: str | None, model_config: ModelConfig, ) -> tuple[_ModelInfo, str]: - if isinstance(architectures, str): - architectures = [architectures] - if not architectures: - raise ValueError("No model architectures are specified") + if not architecture: + raise ValueError("No model architecture is specified") # Require transformers impl if model_config.model_impl == "transformers": - arch = self._try_resolve_transformers(architectures[0], model_config) + arch = self._try_resolve_transformers(architecture, model_config) if arch is not None: model_info = self._try_inspect_model_cls(arch) if model_info is not None: @@ -960,48 +958,42 @@ def inspect_model_cls( # Fallback to transformers impl (after resolving convert_type) if ( - all(arch not in self.models for arch in architectures) + architecture not in self.models and model_config.model_impl == "auto" and getattr(model_config, "convert_type", "none") == "none" ): - arch = self._try_resolve_transformers(architectures[0], model_config) + arch = self._try_resolve_transformers(architecture, model_config) if arch is not None: model_info = self._try_inspect_model_cls(arch) if model_info is not None: return (model_info, arch) - for arch in architectures: - normalized_arch = self._normalize_arch(arch, model_config) - 
model_info = self._try_inspect_model_cls(normalized_arch) - if model_info is not None: - return (model_info, arch) + normalized_arch = self._normalize_arch(architecture, model_config) + model_info = self._try_inspect_model_cls(normalized_arch) + if model_info is not None: + return (model_info, architecture) # Fallback to transformers impl (before resolving runner_type) - if ( - all(arch not in self.models for arch in architectures) - and model_config.model_impl == "auto" - ): - arch = self._try_resolve_transformers(architectures[0], model_config) + if architecture not in self.models and model_config.model_impl == "auto": + arch = self._try_resolve_transformers(architecture, model_config) if arch is not None: model_info = self._try_inspect_model_cls(arch) if model_info is not None: return (model_info, arch) - return self._raise_for_unsupported(architectures) + return self._raise_for_unsupported([architecture]) def resolve_model_cls( self, - architectures: str | list[str], + architecture: str | None, model_config: ModelConfig, ) -> tuple[type[nn.Module], str]: - if isinstance(architectures, str): - architectures = [architectures] - if not architectures: - raise ValueError("No model architectures are specified") + if not architecture: + raise ValueError("No model architecture is specified") # Require transformers impl if model_config.model_impl == "transformers": - arch = self._try_resolve_transformers(architectures[0], model_config) + arch = self._try_resolve_transformers(architecture, model_config) if arch is not None: model_cls = self._try_load_model_cls(arch) if model_cls is not None: @@ -1014,34 +1006,30 @@ def resolve_model_cls( # Fallback to transformers impl (after resolving convert_type) if ( - all(arch not in self.models for arch in architectures) + architecture not in self.models and model_config.model_impl == "auto" and getattr(model_config, "convert_type", "none") == "none" ): - arch = self._try_resolve_transformers(architectures[0], model_config) + arch = self._try_resolve_transformers(architecture, model_config) if arch is not None: model_cls = self._try_load_model_cls(arch) if model_cls is not None: return (model_cls, arch) - for arch in architectures: - normalized_arch = self._normalize_arch(arch, model_config) - model_cls = self._try_load_model_cls(normalized_arch) - if model_cls is not None: - return (model_cls, arch) + normalized_arch = self._normalize_arch(architecture, model_config) + model_cls = self._try_load_model_cls(normalized_arch) + if model_cls is not None: + return (model_cls, architecture) # Fallback to transformers impl (before resolving runner_type) - if ( - all(arch not in self.models for arch in architectures) - and model_config.model_impl == "auto" - ): - arch = self._try_resolve_transformers(architectures[0], model_config) + if architecture not in self.models and model_config.model_impl == "auto": + arch = self._try_resolve_transformers(architecture, model_config) if arch is not None: model_cls = self._try_load_model_cls(arch) if model_cls is not None: return (model_cls, arch) - return self._raise_for_unsupported(architectures) + return self._raise_for_unsupported([architecture]) def is_text_generation_model( self, diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index 31db9d682bd4..77af24031659 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -224,7 +224,7 @@ def forward(self, *args, **kwargs): # MoE activation function 
activation = "silu" - wrapped_arch = self.config.architectures[0].lower() + wrapped_arch = self.config.architecture.lower() if "gptoss" in wrapped_arch: activation = "swigluoai" elif "grok1" in wrapped_arch: diff --git a/vllm/transformers_utils/model_arch_config_convertor.py b/vllm/transformers_utils/model_arch_config_convertor.py index dc067a09419b..f6cf0481cd89 100644 --- a/vllm/transformers_utils/model_arch_config_convertor.py +++ b/vllm/transformers_utils/model_arch_config_convertor.py @@ -26,8 +26,11 @@ def __init__(self, hf_config: PretrainedConfig, hf_text_config: PretrainedConfig self.hf_config = hf_config self.hf_text_config = hf_text_config - def get_architectures(self) -> list[str]: - return getattr(self.hf_config, "architectures", []) + def get_architecture(self) -> str | None: + architectures = getattr(self.hf_config, "architectures", None) + if architectures: + return architectures[0] + return None def get_num_hidden_layers(self) -> int: return getattr(self.hf_text_config, "num_hidden_layers", 0) @@ -243,7 +246,7 @@ def derive_max_model_len_and_key(self) -> tuple[float, str | None]: def convert(self) -> ModelArchitectureConfig: model_arch_config = ModelArchitectureConfig( - architectures=self.get_architectures(), + architecture=self.get_architecture(), model_type=self.hf_config.model_type, text_model_type=getattr(self.hf_text_config, "model_type", None), hidden_size=self.get_hidden_size(), @@ -336,7 +339,7 @@ def get_total_num_kv_heads(self) -> int: raise RuntimeError( "Could not determine the number of key-value attention heads " "from model configuration. " - f"Architecture: {self.get_architectures()}. " + f"Architecture: {self.get_architecture()}. " "This usually indicates an unsupported model architecture or " "missing configuration. 
" "Please check if your model is supported at: " From e75c89bda5f6f39af5ad87778ea98c79d81ba955 Mon Sep 17 00:00:00 2001 From: Xingyu Liu Date: Fri, 2 Jan 2026 15:58:46 -0800 Subject: [PATCH 03/11] assertion for architectures Signed-off-by: Xingyu Liu --- vllm/config/model.py | 3 +++ vllm/transformers_utils/model_arch_config_convertor.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/vllm/config/model.py b/vllm/config/model.py index b63427e2501e..8f905341bc8c 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -519,6 +519,9 @@ def __post_init__( model_info, arch = registry.inspect_model_cls(architecture, self) self._model_info = model_info self._architecture = arch + assert architecture == arch, ( + f"vllm inspected {arch=}, and is different from config {architecture=}" + ) logger.info("Resolved architecture: %s", arch) # Init pooler config if needed diff --git a/vllm/transformers_utils/model_arch_config_convertor.py b/vllm/transformers_utils/model_arch_config_convertor.py index f6cf0481cd89..46c39a4c7bbb 100644 --- a/vllm/transformers_utils/model_arch_config_convertor.py +++ b/vllm/transformers_utils/model_arch_config_convertor.py @@ -29,6 +29,9 @@ def __init__(self, hf_config: PretrainedConfig, hf_text_config: PretrainedConfig def get_architecture(self) -> str | None: architectures = getattr(self.hf_config, "architectures", None) if architectures: + assert len(architectures) == 1, ( + f"len(architectures) should be 1, got {len(architectures)}" + ) return architectures[0] return None From bb9e623dbfd21fbd357be3bf1af9b0c23dfd4a86 Mon Sep 17 00:00:00 2001 From: xingyuliu Date: Fri, 2 Jan 2026 15:58:46 -0800 Subject: [PATCH 04/11] assertion for architectures Signed-off-by: Xingyu Liu --- vllm/model_executor/models/transformers/moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index 77af24031659..31db9d682bd4 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -224,7 +224,7 @@ def forward(self, *args, **kwargs): # MoE activation function activation = "silu" - wrapped_arch = self.config.architecture.lower() + wrapped_arch = self.config.architectures[0].lower() if "gptoss" in wrapped_arch: activation = "swigluoai" elif "grok1" in wrapped_arch: From 1ecc27b7ad56cfac7e010203d743432c4681e5bd Mon Sep 17 00:00:00 2001 From: Xingyu Liu Date: Sat, 3 Jan 2026 23:27:24 -0800 Subject: [PATCH 05/11] do not check model_config.architecture Signed-off-by: Xingyu Liu --- tests/config/test_model_arch_config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/config/test_model_arch_config.py b/tests/config/test_model_arch_config.py index 8fa9d137e837..2dfef21d5d6f 100644 --- a/tests/config/test_model_arch_config.py +++ b/tests/config/test_model_arch_config.py @@ -95,7 +95,6 @@ def _assert_model_config_methods( model_config, expected: dict, check_head_size: bool = True ): """Assert model_config methods return expected values.""" - assert model_config.architecture == expected["architecture"] assert model_config.get_vocab_size() == expected["vocab_size"] assert model_config.get_hidden_size() == expected["hidden_size"] assert model_config.get_total_num_kv_heads() == expected["total_num_kv_heads"] From 4d3e5ca9f85a1c29e79597bcb663195d917b47c9 Mon Sep 17 00:00:00 2001 From: Xingyu Liu Date: Sat, 3 Jan 2026 23:37:09 -0800 Subject: [PATCH 06/11] do not change registry Signed-off-by: Xingyu Liu --- 
vllm/model_executor/models/registry.py | 64 +++++++++++++++----------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 25459c18fa35..e0e346fcd878 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -939,15 +939,17 @@ def _normalize_arch( def inspect_model_cls( self, - architecture: str | None, + architectures: str | list[str], model_config: ModelConfig, ) -> tuple[_ModelInfo, str]: - if not architecture: - raise ValueError("No model architecture is specified") + if isinstance(architectures, str): + architectures = [architectures] + if not architectures: + raise ValueError("No model architectures are specified") # Require transformers impl if model_config.model_impl == "transformers": - arch = self._try_resolve_transformers(architecture, model_config) + arch = self._try_resolve_transformers(architectures[0], model_config) if arch is not None: model_info = self._try_inspect_model_cls(arch) if model_info is not None: @@ -958,42 +960,48 @@ def inspect_model_cls( # Fallback to transformers impl (after resolving convert_type) if ( - architecture not in self.models + all(arch not in self.models for arch in architectures) and model_config.model_impl == "auto" and getattr(model_config, "convert_type", "none") == "none" ): - arch = self._try_resolve_transformers(architecture, model_config) + arch = self._try_resolve_transformers(architectures[0], model_config) if arch is not None: model_info = self._try_inspect_model_cls(arch) if model_info is not None: return (model_info, arch) - normalized_arch = self._normalize_arch(architecture, model_config) - model_info = self._try_inspect_model_cls(normalized_arch) - if model_info is not None: - return (model_info, architecture) + for arch in architectures: + normalized_arch = self._normalize_arch(arch, model_config) + model_info = self._try_inspect_model_cls(normalized_arch) + if model_info is not None: + return (model_info, arch) # Fallback to transformers impl (before resolving runner_type) - if architecture not in self.models and model_config.model_impl == "auto": - arch = self._try_resolve_transformers(architecture, model_config) + if ( + all(arch not in self.models for arch in architectures) + and model_config.model_impl == "auto" + ): + arch = self._try_resolve_transformers(architectures[0], model_config) if arch is not None: model_info = self._try_inspect_model_cls(arch) if model_info is not None: return (model_info, arch) - return self._raise_for_unsupported([architecture]) + return self._raise_for_unsupported(architectures) def resolve_model_cls( self, - architecture: str | None, + architectures: str | list[str], model_config: ModelConfig, ) -> tuple[type[nn.Module], str]: - if not architecture: - raise ValueError("No model architecture is specified") + if isinstance(architectures, str): + architectures = [architectures] + if not architectures: + raise ValueError("No model architectures are specified") # Require transformers impl if model_config.model_impl == "transformers": - arch = self._try_resolve_transformers(architecture, model_config) + arch = self._try_resolve_transformers(architectures[0], model_config) if arch is not None: model_cls = self._try_load_model_cls(arch) if model_cls is not None: @@ -1006,30 +1014,34 @@ def resolve_model_cls( # Fallback to transformers impl (after resolving convert_type) if ( - architecture not in self.models + all(arch not in self.models for arch in architectures) 
and model_config.model_impl == "auto" and getattr(model_config, "convert_type", "none") == "none" ): - arch = self._try_resolve_transformers(architecture, model_config) + arch = self._try_resolve_transformers(architectures[0], model_config) if arch is not None: model_cls = self._try_load_model_cls(arch) if model_cls is not None: return (model_cls, arch) - normalized_arch = self._normalize_arch(architecture, model_config) - model_cls = self._try_load_model_cls(normalized_arch) - if model_cls is not None: - return (model_cls, architecture) + for arch in architectures: + normalized_arch = self._normalize_arch(arch, model_config) + model_cls = self._try_load_model_cls(normalized_arch) + if model_cls is not None: + return (model_cls, arch) # Fallback to transformers impl (before resolving runner_type) - if architecture not in self.models and model_config.model_impl == "auto": - arch = self._try_resolve_transformers(architecture, model_config) + if ( + all(arch not in self.models for arch in architectures) + and model_config.model_impl == "auto" + ): + arch = self._try_resolve_transformers(architectures[0], model_config) if arch is not None: model_cls = self._try_load_model_cls(arch) if model_cls is not None: return (model_cls, arch) - return self._raise_for_unsupported([architecture]) + return self._raise_for_unsupported(architectures) def is_text_generation_model( self, From 003bdd82ac3dea4963a6bb7665d8d9ee7fff31fd Mon Sep 17 00:00:00 2001 From: Xingyu Liu Date: Wed, 14 Jan 2026 11:29:28 -0800 Subject: [PATCH 07/11] fix speculative Signed-off-by: Xingyu Liu --- vllm/config/speculative.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 36e6447124f8..543414e70134 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -421,7 +421,7 @@ def __post_init__(self): ) model_info, arch = ( self.draft_model_config.registry.inspect_model_cls( - self.draft_model_config.architectures, + self.draft_model_config.architecture, self.draft_model_config, ) ) From 283c5e7015787ca7836f8b5d541c6a5a0d24493c Mon Sep 17 00:00:00 2001 From: Xingyu Liu Date: Wed, 14 Jan 2026 11:33:36 -0800 Subject: [PATCH 08/11] fix fused_moe.py Signed-off-by: Xingyu Liu --- vllm/lora/layers/fused_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 18a4a9c026f3..9e78b6164909 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -620,7 +620,7 @@ def create_lora_weights( """Initializes lora matrices.""" assert isinstance(model_config, PretrainedConfig) - self._base_model = model_config.architecture + self._base_model = model_config.architectures[0] self.max_loras = lora_config.max_loras self.fully_sharded = lora_config.fully_sharded_loras From 09fe877071c313f7c642c34e0a41ade86c2b5dc2 Mon Sep 17 00:00:00 2001 From: Xingyu Liu Date: Wed, 14 Jan 2026 11:35:36 -0800 Subject: [PATCH 09/11] fix speculative Signed-off-by: Xingyu Liu --- vllm/config/speculative.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 543414e70134..55f1e99f160d 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -421,7 +421,7 @@ def __post_init__(self): ) model_info, arch = ( self.draft_model_config.registry.inspect_model_cls( - self.draft_model_config.architecture, + self.draft_model_config.model_arch_config.architecture, self.draft_model_config, ) ) From 
41c71ab1d8a7409492090012b20da72f0f7878f5 Mon Sep 17 00:00:00 2001
From: Xingyu Liu
Date: Tue, 27 Jan 2026 16:23:57 -0800
Subject: [PATCH 10/11] fix tests

Signed-off-by: Xingyu Liu
---
 vllm/config/model.py       | 7 ++++---
 vllm/config/speculative.py | 4 +++-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/vllm/config/model.py b/vllm/config/model.py
index e66fea04238b..737ab7ded9f2 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -537,7 +537,6 @@ def __post_init__(
         # may fail to load dynamic modules in child processes
         model_info, arch = registry.inspect_model_cls(architecture, self)
         self._model_info = model_info
-        self._architecture = arch
         assert architecture == arch, (
             f"vllm inspected {arch=}, and is different from config {architecture=}"
         )
@@ -686,7 +685,9 @@ def _get_transformers_backend_cls(self) -> str:
         # Check if the architecture we're wrapping has defaults
         runner = None
         task = None
-        if defaults := try_match_architecture_defaults(self.architecture):
+        if defaults := try_match_architecture_defaults(
+            self.model_arch_config.architecture
+        ):
             _, (runner, task) = defaults
         # User specified value take precedence
         if self.runner != "auto":
@@ -716,7 +717,7 @@ def registry(self):
     @property
     def architecture(self) -> str:
         """The architecture vllm actually used."""
-        return self._architecture
+        return self.model_arch_config.architecture

     def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> None:
         """Pull model/tokenizer from Object Storage to temporary
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py
index 12fc3e348f01..fd7e7c83b01c 100644
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -451,7 +451,9 @@ def __post_init__(self):
             )
         )
         self.draft_model_config._model_info = model_info
-        self.draft_model_config._architecture = arch
+        assert arch == self.draft_model_config.architecture, (
+            f"{arch} != {self.draft_model_config.architecture}"
+        )

         if self.num_speculative_tokens is not None and hasattr(
             self.draft_model_config.hf_config, "num_lookahead_tokens"

From 1c300d89a1c838f6f6a10af75555647a676bce82 Mon Sep 17 00:00:00 2001
From: Xingyu Liu
Date: Thu, 29 Jan 2026 15:23:28 -0800
Subject: [PATCH 11/11] fixes

Signed-off-by: Xingyu Liu
---
 tests/test_config.py       |  4 +++-
 vllm/config/model.py       | 10 ++++------
 vllm/config/model_arch.py  |  5 +++--
 vllm/config/speculative.py |  4 +---
 4 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/tests/test_config.py b/tests/test_config.py
index 8c1bf6c40424..9b4a247f9841 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -1125,5 +1125,7 @@ def test_eagle_draft_model_config():
     assert draft_model_config.hf_text_config.architectures == ["EagleLlamaForCausalLM"]
     assert draft_model_config.hf_config.model_type == "eagle"
     assert draft_model_config.hf_text_config.model_type == "eagle"
-    assert draft_model_config.architectures == ["EagleLlamaForCausalLM"]
+    assert draft_model_config.model_arch_config.architecture == (
+        "EagleLlamaForCausalLM"
+    )
     assert draft_model_config.architecture == "EagleLlamaForCausalLM"
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 737ab7ded9f2..1cc0a62a2950 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -537,9 +537,7 @@ def __post_init__(
         # may fail to load dynamic modules in child processes
         model_info, arch = registry.inspect_model_cls(architecture, self)
         self._model_info = model_info
-        assert architecture == arch, (
-            f"vllm inspected {arch=}, and is different from config {architecture=}"
-        )
+        self._architecture = arch
         logger.info("Resolved architecture: %s", arch)

         # Init pooler config if needed
@@ -717,7 +715,7 @@ def registry(self):
     @property
     def architecture(self) -> str:
         """The architecture vllm actually used."""
-        return self.model_arch_config.architecture
+        return self._architecture

     def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> None:
         """Pull model/tokenizer from Object Storage to temporary
@@ -1077,7 +1075,7 @@ def verify_with_parallel_config(
         pipeline_parallel_size = parallel_config.pipeline_parallel_size

         if pipeline_parallel_size > 1 and not self.registry.is_pp_supported_model(
-            self.architecture, self
+            self.model_arch_config.architecture, self
         ):
             raise NotImplementedError(
                 "Pipeline parallelism is not supported for this model. "
@@ -1388,7 +1386,7 @@ def uses_alibi(self) -> bool:

         return (
             getattr(cfg, "alibi", False)  # Falcon
-            or self.architecture == "BloomForCausalLM"  # Bloom
+            or self.model_arch_config.architecture == "BloomForCausalLM"  # Bloom
             or getattr(cfg, "position_encoding_type", "") == "alibi"  # codellm_1b_alibi
             or (
                 hasattr(cfg, "attn_config")  # MPT
diff --git a/vllm/config/model_arch.py b/vllm/config/model_arch.py
index 3df37d37241b..5eb53b9276d0 100644
--- a/vllm/config/model_arch.py
+++ b/vllm/config/model_arch.py
@@ -17,8 +17,9 @@
     """

     architecture: str | None
-    """Model architecture class name (e.g., 'LlamaForCausalLM').
-    It can be None upon calling `vllm_config.with_hf_config(config.text_config)`"""
+    """Model architecture class name (e.g., 'LlamaForCausalLM') from config.
+    When using the Transformers backend, this may differ from
+    the actual model class used by vLLM."""

     model_type: str
     """Model type identifier (e.g., 'llama', 'gpt_oss')."""
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py
index fd7e7c83b01c..12fc3e348f01 100644
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -451,9 +451,7 @@ def __post_init__(self):
             )
         )
         self.draft_model_config._model_info = model_info
-        assert arch == self.draft_model_config.architecture, (
-            f"{arch} != {self.draft_model_config.architecture}"
-        )
+        self.draft_model_config._architecture = arch

         if self.num_speculative_tokens is not None and hasattr(
             self.draft_model_config.hf_config, "num_lookahead_tokens"
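

The series above collapses vLLM's list-valued `architectures` config field into a
single `architecture` string taken from `hf_config.architectures[0]`. A minimal,
self-contained sketch of that resolution step follows; `HFConfigStub` is an
illustrative stand-in for a Hugging Face `PretrainedConfig`, not vLLM's actual
type, and the logic mirrors `get_architecture` in
vllm/transformers_utils/model_arch_config_convertor.py as changed by PATCH 02/03:

    from dataclasses import dataclass


    @dataclass
    class HFConfigStub:
        # Stand-in for PretrainedConfig: only the field this sketch needs.
        architectures: list[str] | None = None


    def get_architecture(hf_config: HFConfigStub) -> str | None:
        # Collapse the optional list to the single entry vLLM uses: a missing
        # or empty list resolves to None, and multi-entry configs are
        # rejected, matching the assertion added in PATCH 03.
        architectures = getattr(hf_config, "architectures", None)
        if not architectures:
            return None
        assert len(architectures) == 1, (
            f"len(architectures) should be 1, got {len(architectures)}"
        )
        return architectures[0]


    # Usage: a single-architecture checkpoint resolves to its class name.
    assert get_architecture(HFConfigStub(["LlamaForCausalLM"])) == "LlamaForCausalLM"
    assert get_architecture(HFConfigStub()) is None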